seems pretty good. need to make hierarchies moreso...
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concepts.py
blob26a748145f50a191dd657fa068d2e2b1b0f3e839
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 5, 2011
5 @author: mjacob
6 '''
8 from itertools import chain
9 from framework import OntologyBuilderConcepts, OntologyBuilderFramework
10 import nltk
11 wn = nltk.corpus.wordnet
class Nyms(object):
    """Named handles for the relation methods of the WordNet ``Synset`` class.

    Every constant is the method object looked up on the ``Synset`` class
    itself, so it is applied as ``relation(synset)``.  The tuple constants
    bundle related relations; ``ALL`` lists every relation named here.
    """

    # Short local alias for the Synset class; removed again at the end of
    # the class body so that it does not become an attribute of Nyms.
    _synset = nltk.corpus.reader.wordnet.Synset

    HYPERNYMS = _synset.hypernyms
    INSTANCE_HYPERNYMS = _synset.instance_hypernyms
    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = _synset.hyponyms
    INSTANCE_HYPONYMS = _synset.instance_hyponyms
    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = _synset.member_holonyms
    SUBSTANCE_HOLONYMS = _synset.substance_holonyms
    PART_HOLONYMS = _synset.part_holonyms
    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = _synset.member_meronyms
    SUBSTANCE_MERONYMS = _synset.substance_meronyms
    PART_MERONYMS = _synset.part_meronyms
    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = _synset.attributes
    ENTAILMENTS = _synset.entailments
    CAUSES = _synset.causes
    ALSO_SEES = _synset.also_sees
    VERB_GROUPS = _synset.verb_groups
    SIMILAR_TOS = _synset.similar_tos

    # The four groups followed by the ungrouped relations: the same sixteen
    # entries, in the same order, as listing every constant one by one.
    ALL = HYPERNYMY + HYPONYMY + HOLONYMY + MERONYMY + (
        ATTRIBUTES,
        ENTAILMENTS,
        CAUSES,
        ALSO_SEES,
        VERB_GROUPS,
        SIMILAR_TOS,
    )

    del _synset
61 class SimpleConcepts(OntologyBuilderConcepts):
def _get_concepts(self, **state):
    """Group the terms of each synonym entry under the concept found for it.

    Reads the mapping stored in *state* under
    ``OntologyBuilderFramework.SYNONYMS``.  Keys with fewer than two
    elements are skipped.  For every other key ``get_concept`` is called;
    a truthy result is frozen into a tuple and used as the key of the
    returned dict, whose value is the union of the terms of every entry
    that produced that same concept.  One line per processed entry is
    printed to standard output.

    Returns:
        dict mapping concept tuples to sets of terms.
    """
    synonyms = state[OntologyBuilderFramework.SYNONYMS]

    concepts = {}
    for synonym in synonyms:
        if len(synonym) < 2:
            continue

        concept = self.get_concept(synonym)
        terms = synonyms[synonym]

        if not concept:
            # The argument is wrapped in a 1-tuple: with a bare value, a
            # tuple-valued entry would be unpacked by % and raise TypeError.
            print("...... unmeaningful: %s" % (terms,))
            continue

        concept = tuple(concept)  # lists are unhashable; freeze to use as a key
        print("%s means %s" % (terms, concept))
        concepts.setdefault(concept, set()).update(terms)

    return concepts
def get_concept(self, synonym, debug=False):
    """Choose one synset per position of *synonym*.

    The synset for position 0 comes from ``__get_initial_synset``; each
    later position ``k`` comes from ``__get_best_synset``.

    Args:
        synonym: indexable sequence; each element is iterated for candidate
            synsets by the helpers.
        debug: when true, diagnostics are printed to standard output.

    Returns:
        A list with one chosen synset per position, or ``None`` as soon as
        any position yields no (truthy) choice.
    """
    initial_synset = self.__get_initial_synset(synonym, debug=debug)
    if not initial_synset:
        if debug:
            print("no initial synset found")
        return None

    concept = [initial_synset]

    for k in range(1, len(synonym)):
        best_synset = self.__get_best_synset(synonym, k, debug=debug)

        if not best_synset:
            if debug:
                # 1-tuple on purpose: synonym[k] is a collection, and a bare
                # tuple on the right of % would be unpacked as several
                # arguments and raise TypeError.
                print("no optimal synset found for %s" % (synonym[k],))
            return None

        concept.append(best_synset)

    return concept
def __get_initial_synset(self, synonym, debug=False):
    """Pick the synset for the first position of *synonym*.

    Each candidate in ``synonym[0]`` is scored with ``__get_initial_score``;
    the candidate at the index chosen by ``__get_best_score_index`` is
    returned, or ``None`` when no index is chosen.  With *debug* set, the
    candidates and their scores are printed.

    (Original author's note: "supposedly chosen manually? we don't have
    that time..")
    """
    candidates = list(synonym[0])
    candidate_scores = [self.__get_initial_score(candidate, synonym)
                        for candidate in candidates]

    if debug:
        print(candidates)
        print(candidate_scores)

    chosen = self.__get_best_score_index(candidate_scores)
    # None means nothing usable was found for this position.
    return None if chosen is None else candidates[chosen]
def __get_initial_score(self, synset, synonym):
    """Score *synset* against the nearest later position that gives any signal.

    Positions 1, 2, ... of *synonym* are tried in order; the score list of
    the first position whose total is non-zero is returned.  If every
    position totals zero (or there is no later position), ``[0]`` is
    returned.
    """
    for position in range(1, len(synonym)):
        score, total = self.__get_initial_specific_score(
            synset, synonym, synonym[position])
        if total == 0:
            continue
        return score

    # Nothing scored; the all-zero case is handled by the caller.
    return [0]
def __get_initial_specific_score(self, synset, synonym, other_synsets):
    """Score *synset* against every entry of *other_synsets*.

    Each entry must be exactly a ``str`` or ``unicode`` object; anything
    else raises ``Exception``.  *synonym* is only used in that error
    message.

    Returns:
        ``(scores, total)``: the list of intersection scores, one per
        entry, and their sum.
    """
    scores = []
    for other_synset in other_synsets:
        kind = type(other_synset)
        if not (kind is str or kind is unicode):
            raise Exception("wtf is up w/ this type %s %s (%s)" % (other_synset, type(other_synset), synonym))
        # Argument order matters: *synset* is passed first here.
        scores.append(self.__get_intersection_score(synset, other_synset))
    return scores, sum(scores)
def __get_best_synset(self, synonym, k, debug=False):
    """Pick the synset for position *k* of *synonym*.

    Each candidate in ``synonym[k]`` is scored with ``__get_score``; the
    candidate at the index chosen by ``__get_best_score_index`` is
    returned, or ``None`` when no index is chosen.  With *debug* set, the
    candidates and their scores are printed.
    """
    candidates = list(synonym[k])
    candidate_scores = [self.__get_score(candidate, synonym, k)
                        for candidate in candidates]

    if debug:
        print(candidates)
        print(candidate_scores)

    chosen = self.__get_best_score_index(candidate_scores)
    # None means nothing usable was found for this position.
    return None if chosen is None else candidates[chosen]
def __get_score(self, synset, synonym, k):
    """Score *synset* against the nearest earlier position that gives any signal.

    Positions ``k-1`` down to ``0`` of *synonym* are tried in that order;
    the score list of the first position whose total is non-zero is
    returned.  If every position totals zero, ``[0]`` is returned.
    """
    for position in reversed(range(k)):
        score, total = self.__get_specific_score(
            synset, synonym, synonym[position])
        if total == 0:
            continue
        return score

    # Nothing scored; the all-zero case is handled by the caller.
    return [0]
def __get_specific_score(self, synset, synonym, other_synsets):
    """Score every entry of *other_synsets* against *synset*.

    *synonym* is accepted but not used.

    Returns:
        ``(scores, total)``: the list of intersection scores, one per
        entry, and their sum.
    """
    # Argument order matters: the entry from *other_synsets* is passed first.
    scores = [self.__get_intersection_score(other_synset, synset)
              for other_synset in other_synsets]
    return scores, sum(scores)
def __get_intersection_score(self, other_synset, synset):
    """Add up all pairwise heuristic scores for two synset names.

    Both names are resolved through ``wn.synset``; the synset for
    *other_synset* is passed to every heuristic as ``S1`` and the synset
    for *synset* as ``S2``.  The result is the sum of the heuristics'
    return values.
    """
    S1 = wn.synset(other_synset)
    S2 = wn.synset(synset)

    heuristics = (
        self.__get_colour_score,
        self.__get_domain_score,
        self.__get_synonymy_score,
        self.__get_hypernymy_meronymy_path_score,
        self.__get_hyponymy_holonymy_path_score,
        self.__get_parallelism_score,
        self.__get_gloss_score,
        self.__get_topic_score,
        self.__get_gloss_hyperonymy_meronymy_path_score,
        self.__get_gloss_parallelism_score,
        self.__get_gloss_gloss_score,
        self.__get_hyperonymy_meronyomy_gloss_path_score,
        self.__get_parallelism_gloss_score,
    )
    return sum(heuristic(S1, S2) for heuristic in heuristics)
# Synsets used by the colour heuristic (__get_colour_score).
PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
CHROMATIC = wn.synset('chromatic.a.03')
# Everything WordNet lists as "similar to" CHROMATIC, frozen for fast lookup.
COLORS = frozenset(CHROMATIC.similar_tos())
def __get_colour_score(self, S1, S2):
    """Return 1 when *S1* is a colour and *S2* is a physical entity, else 0.

    "Colour" means membership in ``SimpleConcepts.COLORS``; "physical
    entity" means ``SimpleConcepts.PHYSICAL_ENTITY`` occurs on at least one
    of ``S2.hypernym_paths()``.
    """
    if S1 not in SimpleConcepts.COLORS:
        return 0
    on_hypernym_paths = chain(*S2.hypernym_paths())
    return 1 if SimpleConcepts.PHYSICAL_ENTITY in on_hypernym_paths else 0
207 def __get_domain_score(self, S1, S2):
208 import re
209 DOMAIN_LABEL = re.compile('\(of ([^\)]*)\)')
210 m = DOMAIN_LABEL.match(S1.definition)
211 if m:
212 domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
213 score = 0
214 for domain_label_synset in domain_label_synsets:
215 if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
216 score += 1
217 return score
218 return 0
220 def __glosses(self, synset):
221 l = list(synset.examples)
222 l.append(synset.definition)
223 for other in synset.similar_tos():
224 l.append(other.definition)
225 l.extend(other.examples)
226 return l
228 def __get_synonymy_score(self, S1, S2):
229 if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]):
230 return 1
231 else:
232 return 0
234 def __net(self, synset, max_depth=3, *relations):
235 if max_depth < 1:
236 return []
238 if not relations:
239 relations = Nyms.ALL
241 connections = frozenset(chain(*[nym(synset) for nym in relations]))
243 if max_depth == 1:
244 return connections
246 else:
247 return frozenset(chain(connections,
248 chain(*[self.__net(connection, max_depth-1, *relations) for connection in connections])))
def __get_hypernymy_meronymy_path_score(self, S1, S2):
    """Return 1 when *S1* and *S2* are joined by a short relation path, else 0.

    The depth-3 net of *S1* over hypernymy and meronymy relations is
    compared with the depth-3 net of *S2* over hyponymy and holonymy
    relations.  The score is 1 if either synset lies in the other's net or
    the two nets share a member.
    """
    net_of_s1 = self.__net(S1, 3, *(Nyms.HYPERNYMY + Nyms.MERONYMY))
    net_of_s2 = self.__net(S2, 3, *(Nyms.HYPONYMY + Nyms.HOLONYMY))

    connected = (S1 in net_of_s2
                 or S2 in net_of_s1
                 or not net_of_s1.isdisjoint(net_of_s2))
    return 1 if connected else 0
def __get_hyponymy_holonymy_path_score(self, S1, S2):
    """Return the hypernymy/meronymy path score with the arguments swapped."""
    swapped_score = self.__get_hypernymy_meronymy_path_score(S2, S1)
    return swapped_score
def __get_parallelism_score(self, S1, S2):
    """Return 1 when *S1* and *S2* meet within three hypernymy steps, else 0.

    Both depth-3 nets are built over the hypernymy relations only.  The
    score is 1 if either synset lies in the other's net or the two nets
    share a member.
    """
    net_of_s1 = self.__net(S1, 3, *Nyms.HYPERNYMY)
    net_of_s2 = self.__net(S2, 3, *Nyms.HYPERNYMY)

    connected = (S1 in net_of_s2
                 or S2 in net_of_s1
                 or not net_of_s1.isdisjoint(net_of_s2))
    return 1 if connected else 0
270 def __is_a_subsequence(self, a, b):
271 for j in xrange(len(b)-len(a)+1):
272 match = True
273 for i in xrange(len(a)):
274 if not a[i] == b[i+j]:
275 match=False
276 break
277 if match:
278 return True
279 return False
def __get_gloss_score(self, S1, S2):
    """Return 1 when a lemma of one synset appears in the other's glosses.

    A lemma name is split on ``_`` and a gloss on single spaces; the lemma
    "appears" when its words form a contiguous run of the gloss words.
    The lemmas of *S2* are looked for in the glosses of *S1* first, then
    the other way round.
    """
    def named_in_glosses(named, glossed):
        # True if any lemma of *named* occurs in any gloss of *glossed*.
        return any(
            self.__is_a_subsequence(lemma.name.split('_'), gloss.split(' '))
            for lemma in named.lemmas
            for gloss in self.__glosses(glossed))

    if named_in_glosses(S2, S1) or named_in_glosses(S1, S2):
        return 1
    return 0
292 def __get_topic_score(self, S1, S2):
293 return 0
def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
    """Return 1 when a word from *S1*'s glosses is path-related to *S2*.

    Every space-separated word of every gloss of *S1* is looked up with
    ``wn.synsets``.  The score is 1 as soon as one of the resulting
    synsets has a non-zero hypernymy/meronymy or hyponymy/holonymy path
    score against *S2*; otherwise 0.
    """
    gloss_synsets = (
        candidate
        for gloss in self.__glosses(S1)
        for word in gloss.split(' ')
        for candidate in wn.synsets(word))

    for candidate in gloss_synsets:
        if self.__get_hypernymy_meronymy_path_score(candidate, S2):
            return 1
        if self.__get_hyponymy_holonymy_path_score(candidate, S2):
            return 1
    return 0
def __get_gloss_parallelism_score(self, S1, S2):
    """Return 1 when a word from *S1*'s glosses is parallel to *S2*.

    Every space-separated word of every gloss of *S1* is looked up with
    ``wn.synsets``.  The score is 1 as soon as one of the resulting
    synsets has a non-zero parallelism score against *S2*; otherwise 0.
    """
    found = any(
        self.__get_parallelism_score(candidate, S2)
        for gloss in self.__glosses(S1)
        for word in gloss.split(' ')
        for candidate in wn.synsets(word))
    return 1 if found else 0
def __get_gloss_gloss_score(self, S1, S2):
    """Return 1 when the glosses of *S1* and *S2* share a WordNet word.

    Every space-separated word of every gloss is looked up with
    ``wn.synsets``.  Two words "match" when their lookups return the same
    sequence of synsets.  Words with no synsets at all are ignored:
    otherwise any two unknown words (their lookups are both empty) would
    count as a match, and nearly every pair of glosses would score 1.
    """
    def gloss_word_senses(synset):
        # One tuple of synsets per gloss word that WordNet knows.
        senses = set()
        for gloss in self.__glosses(synset):
            for word in gloss.split(' '):
                word_synsets = wn.synsets(word)
                if word_synsets:
                    senses.add(tuple(word_synsets))
        return senses

    if gloss_word_senses(S1) & gloss_word_senses(S2):
        return 1
    return 0
def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
    """Return the gloss hyperonymy/meronymy path score with the arguments swapped."""
    swapped_score = self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)
    return swapped_score
def __get_parallelism_gloss_score(self, S1, S2):
    """Return the gloss parallelism score with the arguments swapped."""
    swapped_score = self.__get_gloss_parallelism_score(S2, S1)
    return swapped_score
def __get_best_score_index(self, scores):
    """Return the index of the best entry of *scores*, or ``None``.

    Empty entries are skipped.  The first entry containing a non-zero
    element becomes the provisional best; after that an entry replaces it
    only when ``__is_better_score`` says it is strictly better.  ``None``
    is returned when no entry contains a non-zero element.
    """
    best = None
    for index, score in enumerate(scores):
        if not score:
            continue
        if best is None:
            # Until a best exists, only an entry with some signal qualifies.
            if any(x != 0 for x in score):
                best = index
        elif self.__is_better_score(score, scores[best]):
            best = index
    return best
def __is_better_score(self, score_a, score_b):
    """Return True when *score_a* has a strictly higher ``__value`` than *score_b*."""
    value_a = self.__value(score_a)
    value_b = self.__value(score_b)
    return value_a > value_b
342 def __value(self, score):
343 # dictionary order gives weird results.
344 return sum(score) + 3*len(filter(lambda x: x != 0, score))