Nearly everything is good to go, modulo a minor hierarchy bug.
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concepts.py
blob0ce0d8fe7d23fb434556faad54cfebffc5f4fa8e
# This Python file uses the following encoding: utf-8
'''
Created on May 5, 2011

@author: mjacob
'''
import re
from itertools import chain

import nltk

from framework import OntologyBuilderConcepts, OntologyBuilderFramework
from util.cached import Cached
from util.memoized import Memoized

wn = nltk.corpus.wordnet # this is to subvert a bug in my IDE

# Matches a definition of the form "(of X) ...": wordnet marks some senses
# as applying only to a specific domain X (e.g. "(of birds) ...").
DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')

# Matches the name of a nominal (noun) synset, e.g. "dog.n.01".
NOMINAL_SYNSET = re.compile(r'\w+\.n\.\d+$')
class Nyms(object):
    """
    Shortcuts for referring to particular relationships in wordnet

    HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.hypernyms}
    INSTANCE_HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hypernyms}

    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.hyponyms}
    INSTANCE_HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hyponyms}

    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.member_holonyms}
    SUBSTANCE_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_holonyms}
    PART_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.part_holonyms}

    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.member_meronyms}
    SUBSTANCE_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_meronyms}
    PART_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.part_meronyms}

    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = C{nltk.corpus.reader.wordnet.Synset.attributes}
    ENTAILMENTS = C{nltk.corpus.reader.wordnet.Synset.entailments}
    CAUSES = C{nltk.corpus.reader.wordnet.Synset.causes}
    ALSO_SEES = C{nltk.corpus.reader.wordnet.Synset.also_sees}
    VERB_GROUPS = C{nltk.corpus.reader.wordnet.Synset.verb_groups}
    SIMILAR_TOS = C{nltk.corpus.reader.wordnet.Synset.similar_tos}

    ALL = all of them
    """
    # private alias so the relation shortcuts below don't each repeat the
    # full dotted path to the Synset class
    _SYNSET = nltk.corpus.reader.wordnet.Synset

    HYPERNYMS = _SYNSET.hypernyms
    INSTANCE_HYPERNYMS = _SYNSET.instance_hypernyms

    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = _SYNSET.hyponyms
    INSTANCE_HYPONYMS = _SYNSET.instance_hyponyms

    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = _SYNSET.member_holonyms
    SUBSTANCE_HOLONYMS = _SYNSET.substance_holonyms
    PART_HOLONYMS = _SYNSET.part_holonyms

    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = _SYNSET.member_meronyms
    SUBSTANCE_MERONYMS = _SYNSET.substance_meronyms
    PART_MERONYMS = _SYNSET.part_meronyms

    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = _SYNSET.attributes
    ENTAILMENTS = _SYNSET.entailments
    CAUSES = _SYNSET.causes
    ALSO_SEES = _SYNSET.also_sees
    VERB_GROUPS = _SYNSET.verb_groups
    SIMILAR_TOS = _SYNSET.similar_tos

    # every relation above, in the same order as the original enumeration
    ALL = (HYPERNYMY
           + HYPONYMY
           + HOLONYMY
           + MERONYMY
           + (ATTRIBUTES,
              ENTAILMENTS,
              CAUSES,
              ALSO_SEES,
              VERB_GROUPS,
              SIMILAR_TOS))
class SimpleConcepts(OntologyBuilderConcepts):
    """
    Generate concepts for terms out of collections of wordnet synsets.

    The basic idea is taken from [Navigli and Velardi 2002], though it's
    simplified somewhat.

    Concepts are basically generated by doing a primitive sort of lexical
    disambiguation, by computing scores for a subterm based upon the presence
    of specific connections in wordnet of a subterm and the subterm preceding it.

    In [Navigli and Velardi 2002], the synset for the initial subterm was
    chosen manually, but this algorithm instead tries to match it against
    the subterm following it. It seems to work well enough for the purposes
    of demonstration.
    """

    # Synsets backing the colour heuristic.  NOTE: evaluating these touches
    # the wordnet corpus at class-definition time.
    CHROMATIC = wn.synset('chromatic.a.03')
    PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
    COLORS = frozenset(CHROMATIC.similar_tos())

    @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPTS)
    def _get_concepts(self, **state):
        """
        given a collection of synonyms (bundles of synsets per subterm),
        disambiguate the synsets and return a single synset per subterm.

        12 specific structural relations mentioned in [Navigli and Velardi 2002]
        are used in the process of disambiguation.

        @param state: framework state; must contain the SYNONYMS mapping
            (synonym tuple -> collection of surface terms)
        @return: dict mapping each concept (tuple of synset names) to the
            set of surface terms it covers
        """
        synonyms = state[OntologyBuilderFramework.SYNONYMS]

        concepts = {}
        for synonym in synonyms:
            if len(synonym) < 2:
                continue  # one lone subterm gives us nothing to compare against

            # make sure that the head of the concept is nominal
            nominal_concept = self.__ensure_nominal(synonym)
            if not nominal_concept[-1]:
                continue  # if not a nominal concept, just skip it

            concept = self.get_concept(nominal_concept)
            if concept:
                concept = tuple(concept)  # finalize it
                print("%s means %s" % (synonyms[synonym], concept))
                if concept in concepts:
                    concepts[concept].update(synonyms[synonym])
                else:
                    concepts[concept] = set(synonyms[synonym])
            else:
                # explicit 1-tuple so a tuple-valued argument can't break "%"
                print("...... unmeaningful: %s" % (synonyms[synonym],))

        return concepts

    def __ensure_nominal(self, synonym):
        """filters out non-nominal synsets from the head of the term"""
        # keep only noun senses (names like "dog.n.01") in the last subterm
        filtered_end = [s for s in synonym[-1] if NOMINAL_SYNSET.match(s)]
        return tuple(chain(synonym[:-1], [filtered_end]))

    def get_concept(self, synonym, debug=False):
        """
        disambiguate a specific synonym.

        @return: a list with one synset name per subterm, or None when any
            subterm could not be disambiguated
        """
        initial_synset = self.__get_initial_synset(synonym, debug=debug)
        if not initial_synset:
            if debug:
                print("no initial synset found")
            return None

        concept = [initial_synset]
        for k in range(1, len(synonym)):
            best_synset = self.__get_best_synset(synonym, k, debug=debug)
            if not best_synset:
                if debug:
                    # explicit 1-tuple: synonym[k] is itself a collection and
                    # would previously have been spliced into the "%" args
                    print("no optimal synset found for %s" % (synonym[k],))
                return None
            concept.append(best_synset)

        return concept

    def __pick_best(self, candidates, score_of, debug=False):
        """
        score every candidate synset with C{score_of} and return the best
        one, or None when no candidate scored at all.
        """
        candidates = list(candidates)
        scores = [score_of(candidate) for candidate in candidates]

        if debug:
            print(candidates)
            print(scores)

        index = self.__get_best_score_index(scores)
        if index is not None:
            return candidates[index]
        else:
            return None  # we can't do anything w/ this

    def __get_initial_synset(self, synonym, debug=False):
        """
        compares the synsets for the initial subterm to those coming after it
        to attempt to disambiguate the sense of the subterm.
        """
        return self.__pick_best(
            synonym[0],
            lambda synset: self.__get_initial_score(synset, synonym),
            debug=debug)

    def __get_best_synset(self, synonym, k, debug=False):
        """
        disambiguate the synsets of a subterm by looking for particular
        relations in wordnet between them and the synsets of the preceding
        subterm.

        returns the best synset, or None if no relationships are found.
        """
        return self.__pick_best(
            synonym[k],
            lambda synset: self.__get_score(synset, synonym, k),
            debug=debug)

    def __get_initial_score(self, synset, synonym):
        """
        get the score for a particular initial synset.
        If no matching synsets are found at all, returns C{[0]}
        """
        # walk forward through the following subterms until one shows any
        # relationship at all
        for i in range(1, len(synonym)):
            score, score_total = self.__get_initial_specific_score(synset, synonym[i])
            if score_total != 0:
                return score
        return [0]  # it's all 0s, we'll dump the term later

    def __get_initial_specific_score(self, synset, other_synsets):
        """
        get the initial score of a synset compared to each of a collection
        of other synsets.  Note the candidate is passed as S1 here.

        @return: (list of per-synset scores, their sum)
        """
        score = [self.__get_intersection_score(synset, other_synset)
                 for other_synset in other_synsets]
        return score, sum(score)

    def __get_score(self, synset, synonym, k):
        """
        get the score of a C{synset} compared to all of the synsets
        at subterm C{k-1}

        if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
        """
        for i in range(k - 1, -1, -1):
            score, score_total = self.__get_specific_score(synset, synonym[i])
            if score_total != 0:
                return score
        return [0]  # it's all 0s, we'll deal w/ that later

    def __get_specific_score(self, synset, other_synsets):
        """
        get the score of a synset compared to each of a collection of other
        synsets.  Note the preceding synsets are passed as S1 here.

        @return: (list of per-synset scores, their sum)
        """
        score = [self.__get_intersection_score(other_synset, synset)
                 for other_synset in other_synsets]
        return score, sum(score)

    @Memoized
    def __get_intersection_score(self, other_synset, synset):
        """
        count the number of semantically meaningful relationships in wordnet
        as specified in [Navigli and Velardi 2002]

        @param other_synset: synset name of the left-hand sense (S1)
        @param synset: synset name of the right-hand sense (S2)
        """
        S1 = wn.synset(other_synset)
        S2 = wn.synset(synset)
        score = 0
        score += self.__get_colour_score(S1, S2)
        score += self.__get_domain_score(S1, S2)
        score += self.__get_synonymy_score(S1, S2)
        score += self.__get_hypernymy_meronymy_path_score(S1, S2)
        score += self.__get_hyponymy_holonymy_path_score(S1, S2)
        score += self.__get_parallelism_score(S1, S2)
        score += self.__get_gloss_score(S1, S2)
        score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
        score += self.__get_gloss_parallelism_score(S1, S2)
        score += self.__get_gloss_gloss_score(S1, S2)
        score += self.__get_hyperonymy_meronyomy_gloss_path_score(S1, S2)
        score += self.__get_parallelism_gloss_score(S1, S2)
        return score

    def __get_colour_score(self, S1, S2):
        """
        if S1 is a color and S2 is a physical object, return 1. else return 0.
        """
        if (S1 in SimpleConcepts.COLORS
                and SimpleConcepts.PHYSICAL_ENTITY in chain(*S2.hypernym_paths())):
            return 1
        else:
            return 0

    def __get_domain_score(self, S1, S2):
        """
        if the definition of S1 is of the form that it indicates a property
        of something specific (a "(of X) ..." gloss), see if S2 is that type
        of thing.
        """
        m = DOMAIN_LABEL.match(S1.definition)
        if not m:
            return 0
        domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
        score = 0
        for domain_label_synset in domain_label_synsets:
            if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
                score += 1
        return score

    @Memoized
    def __glosses(self, synset):
        """
        return a list of all definitions and examples of the synset,
        along with all definitions and examples of anything similar to it.
        """
        glosses = list(synset.examples)
        glosses.append(synset.definition)
        for other in synset.similar_tos():
            glosses.append(other.definition)
            glosses.extend(other.examples)
        return glosses

    def __get_synonymy_score(self, S1, S2):
        """
        If S1 and S2 are synonyms (or S2 pertains to a lemma of S1),
        return 1. else 0.
        """
        if S1 == S2:
            return 1
        pertainym_synsets = chain(*[(b.synset for b in a.pertainyms())
                                    for a in S1.lemmas])
        if S2 in pertainym_synsets:
            return 1
        return 0

    @Memoized
    def __net(self, synset, max_depth=3, *relations):
        """
        finds all connected synsets of @param synset of the kind specified in
        @param relations, up to @param max_depth, which has a default of 3.
        """
        if max_depth < 1:
            # was []; normalized so every path returns a frozenset
            return frozenset()

        if not relations:
            relations = Nyms.ALL

        connections = frozenset(chain(*[nym(synset) for nym in relations]))

        if max_depth == 1:
            return connections
        return frozenset(chain(connections,
                               chain(*[self.__net(connection, max_depth - 1, *relations)
                                       for connection in connections])))

    def __get_hypernymy_meronymy_path_score(self, S1, S2):
        """
        If something above S1 and something below S2 are identical,
        return 1, otherwise return 0.
        """
        # NOTE(review): hypernymy is paired with MERONYMY here and hyponymy
        # with HOLONYMY; confirm against [Navigli and Velardi 2002] that this
        # is the intended direction ("minor hierarchy bug"?).
        net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY, Nyms.MERONYMY))
        net2 = self.__net(S2, 3, *chain(Nyms.HYPONYMY, Nyms.HOLONYMY))
        if S1 in net2 or S2 in net1 or net1.intersection(net2):
            return 1
        else:
            return 0

    def __get_hyponymy_holonymy_path_score(self, S1, S2):
        """
        If something above S2 and something below S1 are identical,
        return 1, otherwise return 0.
        """
        return self.__get_hypernymy_meronymy_path_score(S2, S1)

    def __get_parallelism_score(self, S1, S2):
        """
        If S1 and S2 share something above them, return 1, else return 0
        """
        net1 = self.__net(S1, 3, *Nyms.HYPERNYMY)
        net2 = self.__net(S2, 3, *Nyms.HYPERNYMY)
        if S1 in net2 or S2 in net1 or net1.intersection(net2):
            return 1
        else:
            return 0

    def __is_a_subsequence(self, a, b):
        """
        returns true if @param a is a contiguous subsequence of @param b
        """
        n = len(a)
        for j in range(len(b) - n + 1):
            if b[j:j + n] == a:
                return True
        return False

    def __get_gloss_score(self, S1, S2):
        """
        if a word representing S1 is mentioned in the glosses of S2 or vice versa,
        return 1, otherwise return 0
        """
        if self.__mentions(S1, S2) or self.__mentions(S2, S1):
            return 1
        return 0

    def __mentions(self, glossed, named):
        """true if some lemma of C{named} appears in a gloss of C{glossed}"""
        for lemma in named.lemmas:
            words = lemma.name.split('_')
            for gloss in self.__glosses(glossed):
                if self.__is_a_subsequence(words, gloss.split(' ')):
                    return True
        return False

    def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
        """
        if something in the gloss of S1 is above or below S2, return 1
        otherwise 0
        """
        for gloss in self.__glosses(S1):
            for word in gloss.split(' '):
                for synset in wn.synsets(word):
                    if (self.__get_hypernymy_meronymy_path_score(synset, S2)
                            or self.__get_hyponymy_holonymy_path_score(synset, S2)):
                        return 1
        return 0

    def __get_gloss_parallelism_score(self, S1, S2):
        """
        if there is a common term above S2 and a word in the glosses of S1,
        return 1, otherwise 0
        """
        for gloss in self.__glosses(S1):
            for word in gloss.split(' '):
                for synset in wn.synsets(word):
                    if self.__get_parallelism_score(synset, S2):
                        return 1
        return 0

    def __gloss_synsets(self, synset):
        """every synset of every word appearing in the glosses of C{synset}"""
        return set(chain(*[wn.synsets(word)
                           for gloss in self.__glosses(synset)
                           for word in gloss.split(' ')]))

    def __get_gloss_gloss_score(self, S1, S2):
        """
        If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
        return 1, otherwise 0.
        """
        # set intersection instead of the former O(n*m) list-membership scan
        if self.__gloss_synsets(S1) & self.__gloss_synsets(S2):
            return 1
        return 0

    def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
        """
        if something in the gloss of S2 is above or below S1, return 1
        otherwise 0
        """
        return self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)

    def __get_parallelism_gloss_score(self, S1, S2):
        """
        if there is a common term above S1 and a word in the glosses of S2,
        return 1, otherwise 0
        """
        return self.__get_gloss_parallelism_score(S2, S1)

    def __get_best_score_index(self, scores):
        """
        given a set of scores, return the index of the "best" one, or None
        when every score is empty or all-zero.

        scores are assumed to be lists of numbers
        """
        best_score_index = None
        for i, score in enumerate(scores):
            if not score:
                continue
            if best_score_index is None:
                # only a score with at least one nonzero entry can seed the best
                if any(x != 0 for x in score):
                    best_score_index = i
            elif self.__is_better_score(score, scores[best_score_index]):
                best_score_index = i
        return best_score_index

    def __is_better_score(self, score_a, score_b):
        """
        returns true if score_a is better than score_b
        """
        return self.__value(score_a) > self.__value(score_b)

    def __value(self, score):
        """
        [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
        but don't specify exactly what they mean by this, and my attempts at recreating it
        produced incredibly weird results. Thus, I've opted for something a little simpler
        that seems to work well enough in practice.

        the "value" of a score is considered to be 3* the number of non-0 subscores in it,
        plus the sum total of the subscores.
        thus, having two scores of "1" results in a total score of 8, which is equivalent
        to a single score of "5"
        """
        # dictionary order gives weird results.
        return sum(score) + 3 * sum(1 for x in score if x != 0)