# This Python file uses the following encoding: utf-8
'''
Created on May 5, 2011

@author: mjacob
'''

from itertools import chain, combinations
from framework import OntologyBuilderConcepts, OntologyBuilderFramework
import nltk
wn = nltk.corpus.wordnet
class Nyms(object):
    """Unbound Synset methods for every WordNet relation used in this module."""
    HYPERNYMS = nltk.corpus.reader.wordnet.Synset.hypernyms
    INSTANCE_HYPERNYMS = nltk.corpus.reader.wordnet.Synset.instance_hypernyms
    HYPONYMS = nltk.corpus.reader.wordnet.Synset.hyponyms
    INSTANCE_HYPONYMS = nltk.corpus.reader.wordnet.Synset.instance_hyponyms
    MEMBER_HOLONYMS = nltk.corpus.reader.wordnet.Synset.member_holonyms
    SUBSTANCE_HOLONYMS = nltk.corpus.reader.wordnet.Synset.substance_holonyms
    PART_HOLONYMS = nltk.corpus.reader.wordnet.Synset.part_holonyms
    MEMBER_MERONYMS = nltk.corpus.reader.wordnet.Synset.member_meronyms
    SUBSTANCE_MERONYMS = nltk.corpus.reader.wordnet.Synset.substance_meronyms
    PART_MERONYMS = nltk.corpus.reader.wordnet.Synset.part_meronyms
    ATTRIBUTES = nltk.corpus.reader.wordnet.Synset.attributes
    ENTAILMENTS = nltk.corpus.reader.wordnet.Synset.entailments
    CAUSES = nltk.corpus.reader.wordnet.Synset.causes
    ALSO_SEES = nltk.corpus.reader.wordnet.Synset.also_sees
    VERB_GROUPS = nltk.corpus.reader.wordnet.Synset.verb_groups
    SIMILAR_TOS = nltk.corpus.reader.wordnet.Synset.similar_tos
    ALL = (HYPERNYMS
           , INSTANCE_HYPERNYMS
           , HYPONYMS
           , INSTANCE_HYPONYMS
           , MEMBER_HOLONYMS
           , SUBSTANCE_HOLONYMS
           , PART_HOLONYMS
           , MEMBER_MERONYMS
           , SUBSTANCE_MERONYMS
           , PART_MERONYMS
           , ATTRIBUTES
           , ENTAILMENTS
           , CAUSES
           , ALSO_SEES
           , VERB_GROUPS
           , SIMILAR_TOS)
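
    # Each attribute above is an unbound Synset method, so it can be applied to
    # any synset directly.  A minimal sketch (assuming the NLTK WordNet corpus
    # is installed):
    #
    #     >>> Nyms.HYPERNYMS(wn.synset('dog.n.01'))
    #     [Synset('canine.n.02'), Synset('domestic_animal.n.01')]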

class SimpleConcepts(OntologyBuilderConcepts):
    def _get_concepts(self, **state):
        synonyms = state[OntologyBuilderFramework.SYNONYMS]

        concepts = {}
        for synonym in synonyms:
            #nets = self.__get_subterm_nets(synonym)

            concept = [self.__get_initial_synset(synonym)]

            no_synset = False

            for k in xrange(1, len(synonym)):
                best_synset = self.__get_best_synset(synonym, k)

                if not best_synset:
                    no_synset = True
                    break

                concept.append(best_synset)

            if not no_synset:
                # use a tuple so the concept can serve as a dictionary key
                concepts.setdefault(tuple(concept), []).extend(synonyms[synonym])

        return concepts
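
    # Assumed data shapes (inferred from the code above): `synonym` behaves as a
    # sequence whose k-th element is a list of candidate WordNet synset names for
    # the k-th subterm, `synonyms[synonym]` is the list of surface terms sharing
    # that synonym, and `concepts` maps each chosen tuple of synsets to the
    # accumulated terms.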

    def __get_initial_synset(self, synonym):
        # FIXME supposedly chosen manually.
        raise Exception()

    def __get_best_synset(self, synonym, k):
        subterm_synsets = synonym[k]
        scores = []
        for j in xrange(len(subterm_synsets)):
            subterm_synset = subterm_synsets[j]
            score = self.__get_score(subterm_synset, synonym, k)
            scores.append(score)
        index = self.__get_best_score_index(scores)
        if index is not None:  # 0 is a valid index
            return subterm_synsets[index]
        else:
            return None # we can't do anything w/ this

    def __get_score(self, synset, synonym, k):
        for i in xrange(k-1, -1, -1):
            score, score_total = self.__get_specific_score(synset, synonym, k, i)
            if score_total != 0:
                return score

        return score # it's all 0s, we'll deal w/ that later

    def __get_specific_score(self, synset, synonym, k, i):
        score = []
        total_score = 0
        other_synsets = synonym[i]
        for other_synset in other_synsets:
            intersection_score = self.__get_intersection_score(other_synset, synset)
            score.append(intersection_score)
            total_score += intersection_score
        return score, total_score

    def __get_intersection_score(self, other_synset, synset):
        S1 = wn.synset(other_synset)
        S2 = wn.synset(synset)
        score = 0
        score += self.__get_colour_score(S1, S2)
        score += self.__get_domain_score(S1, S2)
        score += self.__get_synonymy_score(S1, S2)
        score += self.__get_hypernymy_meronymy_path_score(S1, S2)
        score += self.__get_hyponymy_holonymy_path_score(S1, S2)
        score += self.__get_parallelism_score(S1, S2)
        score += self.__get_gloss_score(S1, S2)
        score += self.__get_topic_score(S1, S2)
        score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
        score += self.__get_gloss_parallelism_score(S1, S2)
        score += self.__get_gloss_gloss_score(S1, S2)
        score += self.__get_hyperonymy_meronymy_gloss_path_score(S1, S2)
        score += self.__get_parallelism_gloss_score(S1, S2)
        return score
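
    # The total above sums thirteen pairwise relatedness heuristics between the
    # two synsets; only the colour, domain, and synonymy scores are implemented
    # in this file, and the remaining ones are stubs further down.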

    def __get_colour_score(self, S1, S2):
        CHROMATIC = wn.synset('chromatic.a.3')
        PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
        COLORS = frozenset(CHROMATIC.similar_tos())
        if S1 in COLORS and PHYSICAL_ENTITY in chain(*S2.hypernym_paths()):
            return 1
        else:
            return 0

    def __get_domain_score(self, S1, S2):
        import re
        DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')
        m = DOMAIN_LABEL.match(S1.definition)
        if m:
            domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
            score = 0
            for domain_label_synset in domain_label_synsets:
                if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
                    score += 1
            return score
        return 0

    def __get_synonymy_score(self, S1, S2):
        if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]):
            return 1
        else:
            return 0

    def __path(self, synset, max_depth=3, *relations):
        if max_depth < 1:
            return []

        if not relations:
            relations = Nyms.ALL
        # materialize the connections so the iterable is not exhausted before
        # it is chained with the deeper levels below
        connections = list(chain(*[nym(synset) for nym in relations]))
        if max_depth == 1:
            return connections
        else:
            return chain(connections,
                         chain(*[self.__path(connection, max_depth-1, *relations)
                                 for connection in connections]))
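
    # For example, self.__path(wn.synset('dog.n.01'), 2, Nyms.HYPERNYMS) walks
    # two levels of hypernyms only, while calling it with no relations follows
    # every relation listed in Nyms.ALL.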

    # The remaining relatedness heuristics are unimplemented stubs.
    def __get_hypernymy_meronymy_path_score(self, S1, S2):
        raise Exception()
    def __get_hyponymy_holonymy_path_score(self, S1, S2):
        raise Exception()
    def __get_parallelism_score(self, S1, S2):
        raise Exception()
    def __get_gloss_score(self, S1, S2):
        raise Exception()
    def __get_topic_score(self, S1, S2):
        raise Exception()
    def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
        raise Exception()
    def __get_gloss_parallelism_score(self, S1, S2):
        raise Exception()
    def __get_gloss_gloss_score(self, S1, S2):
        raise Exception()
    def __get_hyperonymy_meronymy_gloss_path_score(self, S1, S2):
        raise Exception()
    def __get_parallelism_gloss_score(self, S1, S2):
        raise Exception()

    def __get_best_score_index(self, scores):
        best_score_index = None
        for i in xrange(len(scores)):
            score = scores[i]
            if best_score_index is None:  # index 0 is a legitimate best index
                if score[-1] != 0:
                    best_score_index = i
            elif self.__is_better_score(scores[i], scores[best_score_index]):
                best_score_index = i
        return best_score_index

    def __is_better_score(self, score_a, score_b):
        return sorted(score_a) > sorted(score_b)
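
    # Comparison is lexicographic over the sorted score vectors; for instance,
    # sorted([0, 2, 1]) > sorted([1, 1, 0]) gives [0, 1, 2] > [0, 1, 1], which
    # is True, so the first vector counts as the better score.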

    def random_code_hole(self, nets): # FIXME delete
        for i in xrange(1, len(nets)):
            subterm_nets_a = nets[i]
            score_vectors = {}
            for net_a in subterm_nets_a:
                for j in xrange(i-1, -1, -1):
                    subterm_nets_b = nets[j]
                    scores = []
                    total = 0
                    for net_b in subterm_nets_b:
                        net_i = net_a.intersection(net_b)
                        score = self.__score(net_i)
                        scores.append(score)
                        total += score
                    if total != 0:
                        "stuff"
                        break

    def __get_subterm_nets(self, synonym):
        return [[frozenset(self.__nyms(wn.synset(synset), 3))
                 for synset in subterm_synonym]
                for subterm_synonym in synonym]

    def __nyms(self, synset, n=1):
        # materialized as a list so it can be iterated more than once below
        connections = list(chain(synset.hypernyms(),
                                 synset.instance_hypernyms(),
                                 synset.hyponyms(),
                                 synset.instance_hyponyms(),
                                 synset.member_holonyms(),
                                 synset.substance_holonyms(),
                                 synset.part_holonyms(),
                                 synset.member_meronyms(),
                                 synset.substance_meronyms(),
                                 synset.part_meronyms(),
                                 synset.attributes(),
                                 synset.entailments(),
                                 synset.causes(),
                                 synset.also_sees(),
                                 synset.verb_groups(),
                                 synset.similar_tos()))
        if n == 1:
            return connections
        else:
            return chain(connections,
                         chain(*[self.__nyms(connection, n-1) for connection in connections]))

from itertools import chain

def nyms(synset, n=1):
    # materialized as a list so it can be iterated more than once below
    connections = list(chain(synset.hypernyms(),
                             synset.instance_hypernyms(),
                             synset.hyponyms(),
                             synset.instance_hyponyms(),
                             synset.member_holonyms(),
                             synset.substance_holonyms(),
                             synset.part_holonyms(),
                             synset.member_meronyms(),
                             synset.substance_meronyms(),
                             synset.part_meronyms(),
                             synset.attributes(),
                             synset.entailments(),
                             synset.causes(),
                             synset.also_sees(),
                             synset.verb_groups(),
                             synset.similar_tos()))
    if n == 1:
        return connections
    else:
        return chain(connections,
                     chain(*[nyms(connection, n-1) for connection in connections]))

def nyms2(synset):
    for collection in (synset.hypernyms(),
                       synset.instance_hypernyms(),
                       synset.hyponyms(),
                       synset.instance_hyponyms(),
                       synset.member_holonyms(),
                       synset.substance_holonyms(),
                       synset.part_holonyms(),
                       synset.member_meronyms(),
                       synset.substance_meronyms(),
                       synset.part_meronyms(),
                       synset.attributes(),
                       synset.entailments(),
                       synset.causes(),
                       synset.also_sees(),
                       synset.verb_groups(),
                       synset.similar_tos()):
        print collection
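
# Minimal usage sketch (assumes the NLTK WordNet corpus has been downloaded,
# e.g. via nltk.download('wordnet')):
if __name__ == '__main__':
    dog = wn.synset('dog.n.01')
    # one step of every WordNet relation from dog.n.01
    for related in nyms(dog):
        print related
    # the same relations, grouped by relation type
    nyms2(dog)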