1 # This Python file uses the following encoding: utf-8
import re
from itertools import chain

import nltk

from framework import OntologyBuilderConcepts, OntologyBuilderFramework
11 wn
= nltk
.corpus
.wordnet
15 HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hypernyms
16 INSTANCE_HYPERNYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hypernyms
18 HYPERNYMY
= (HYPERNYMS
, INSTANCE_HYPERNYMS
)
20 HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.hyponyms
21 INSTANCE_HYPONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.instance_hyponyms
23 HYPONYMY
= (HYPONYMS
, INSTANCE_HYPONYMS
)
25 MEMBER_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_holonyms
26 SUBSTANCE_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_holonyms
27 PART_HOLONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_holonyms
29 HOLONYMY
= (MEMBER_HOLONYMS
, SUBSTANCE_HOLONYMS
, PART_HOLONYMS
)
31 MEMBER_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.member_meronyms
32 SUBSTANCE_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.substance_meronyms
33 PART_MERONYMS
= nltk
.corpus
.reader
.wordnet
.Synset
.part_meronyms
35 MERONYMY
= (MEMBER_MERONYMS
, SUBSTANCE_MERONYMS
, PART_MERONYMS
)
37 ATTRIBUTES
= nltk
.corpus
.reader
.wordnet
.Synset
.attributes
38 ENTAILMENTS
= nltk
.corpus
.reader
.wordnet
.Synset
.entailments
39 CAUSES
= nltk
.corpus
.reader
.wordnet
.Synset
.causes
40 ALSO_SEES
= nltk
.corpus
.reader
.wordnet
.Synset
.also_sees
41 VERB_GROUPS
= nltk
.corpus
.reader
.wordnet
.Synset
.verb_groups
42 SIMILAR_TOS
= nltk
.corpus
.reader
.wordnet
.Synset
.similar_tos
61 class SimpleConcepts(OntologyBuilderConcepts
):
62 def _get_concepts(self
, **state
):
63 synonyms
= state
[OntologyBuilderFramework
.SYNONYMS
]
66 for synonym
in synonyms
:
70 concept
= self
.get_concept(synonym
)
73 concept
= tuple(concept
) # finalize it
74 print "%s means %s" % (synonyms
[synonym
], concept
)
76 if concept
in concepts
:
77 concepts
[concept
].update(synonyms
[synonym
])
79 concepts
[concept
] = set(synonyms
[synonym
])
81 print "...... unmeaningful: %s" % (synonyms
[synonym
])
85 def get_concept(self
, synonym
, debug
=False):
86 initial_synset
= self
.__get
_initial
_synset
(synonym
, debug
=debug
)
87 if not initial_synset
:
89 print "no initial synset found"
92 concept
= [initial_synset
]
94 for k
in xrange(1, len(synonym
)):
95 best_synset
= self
.__get
_best
_synset
(synonym
, k
, debug
=debug
)
99 print "no optimal synset found for %s" % (synonym
[k
])
102 concept
.append(best_synset
)
106 def __get_initial_synset(self
, synonym
, debug
=False):
107 """supposedly chosen manually? we don't have that time.."""
108 subterm_synsets
= list(synonym
[0])
110 for subterm_synset
in subterm_synsets
:
111 score
= self
.__get
_initial
_score
(subterm_synset
, synonym
)
115 print subterm_synsets
118 index
= self
.__get
_best
_score
_index
(scores
)
119 if index
is not None:
120 return subterm_synsets
[index
]
122 return None # we can't do anything w/ this
124 def __get_initial_score(self
, synset
, synonym
):
125 for i
in xrange(1, len(synonym
)):
126 score
, score_total
= self
.__get
_initial
_specific
_score
(synset
, synonym
, synonym
[i
])
130 return [0] # it's all 0s, we'll deal w/ that later
132 def __get_initial_specific_score(self
, synset
, synonym
, other_synsets
):
135 for other_synset
in other_synsets
:
136 if type(other_synset
) is not str and type(other_synset
) is not unicode:
137 raise Exception("wtf is up w/ this type %s %s (%s)" % (other_synset
, type(other_synset
), synonym
))
138 intersection_score
= self
.__get
_intersection
_score
(synset
, other_synset
)
139 score
.append(intersection_score
)
140 total_score
+= intersection_score
141 return score
, total_score
143 def __get_best_synset(self
, synonym
, k
, debug
=False):
144 subterm_synsets
= list(synonym
[k
])
146 for subterm_synset
in subterm_synsets
:
147 score
= self
.__get
_score
(subterm_synset
, synonym
, k
)
151 print subterm_synsets
154 index
= self
.__get
_best
_score
_index
(scores
)
155 if index
is not None:
156 return subterm_synsets
[index
]
158 return None # we can't do anything w/ this
160 def __get_score(self
, synset
, synonym
, k
):
161 for i
in xrange(k
-1, -1, -1):
162 score
, score_total
= self
.__get
_specific
_score
(synset
, synonym
, synonym
[i
])
166 return [0] # it's all 0s, we'll deal w/ that later
168 def __get_specific_score(self
, synset
, synonym
, other_synsets
):
171 for other_synset
in other_synsets
:
172 intersection_score
= self
.__get
_intersection
_score
(other_synset
, synset
)
173 score
.append(intersection_score
)
174 total_score
+= intersection_score
175 return score
, total_score
177 def __get_intersection_score(self
, other_synset
, synset
):
178 S1
= wn
.synset(other_synset
)
179 S2
= wn
.synset(synset
)
181 score
+= self
.__get
_colour
_score
(S1
, S2
)
182 score
+= self
.__get
_domain
_score
(S1
, S2
)
183 score
+= self
.__get
_synonymy
_score
(S1
, S2
)
184 score
+= self
.__get
_hypernymy
_meronymy
_path
_score
(S1
, S2
)
185 score
+= self
.__get
_hyponymy
_holonymy
_path
_score
(S1
, S2
)
186 score
+= self
.__get
_parallelism
_score
(S1
, S2
)
187 score
+= self
.__get
_gloss
_score
(S1
, S2
)
188 score
+= self
.__get
_topic
_score
(S1
, S2
)
189 score
+= self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S1
, S2
)
190 score
+= self
.__get
_gloss
_parallelism
_score
(S1
, S2
)
191 score
+= self
.__get
_gloss
_gloss
_score
(S1
, S2
)
192 score
+= self
.__get
_hyperonymy
_meronyomy
_gloss
_path
_score
(S1
, S2
)
193 score
+= self
.__get
_parallelism
_gloss
_score
(S1
, S2
)
194 #print "intersection score: %s %s %s" % (other_synset, synset, score)
197 CHROMATIC
= wn
.synset('chromatic.a.03')
198 PHYSICAL_ENTITY
= wn
.synset('physical_entity.n.01')
199 COLORS
= frozenset(CHROMATIC
.similar_tos())
201 def __get_colour_score(self
, S1
, S2
):
202 if S1
in SimpleConcepts
.COLORS
and SimpleConcepts
.PHYSICAL_ENTITY
in chain(*S2
.hypernym_paths()):
207 def __get_domain_score(self
, S1
, S2
):
209 DOMAIN_LABEL
= re
.compile('\(of ([^\)]*)\)')
210 m
= DOMAIN_LABEL
.match(S1
.definition
)
212 domain_label_synsets
= wn
.synsets(m
.group(1).replace(' ', '_'), pos
=wn
.NOUN
)
214 for domain_label_synset
in domain_label_synsets
:
215 if S2
== domain_label_synset
or S2
in domain_label_synset
.hyponyms():
220 def __glosses(self
, synset
):
221 l
= list(synset
.examples
)
222 l
.append(synset
.definition
)
223 for other
in synset
.similar_tos():
224 l
.append(other
.definition
)
225 l
.extend(other
.examples
)
228 def __get_synonymy_score(self
, S1
, S2
):
229 if S1
== S2
or S2
in chain(*[(b
.synset
for b
in a
.pertainyms()) for a
in S1
.lemmas
]):
234 def __net(self
, synset
, max_depth
=3, *relations
):
241 connections
= frozenset(chain(*[nym(synset
) for nym
in relations
]))
247 return frozenset(chain(connections
,
248 chain(*[self
.__net
(connection
, max_depth
-1, *relations
) for connection
in connections
])))
251 def __get_hypernymy_meronymy_path_score(self
, S1
, S2
):
252 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
, Nyms
.MERONYMY
))
253 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPONYMY
, Nyms
.HOLONYMY
))
254 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
259 def __get_hyponymy_holonymy_path_score(self
, S1
, S2
):
260 return self
.__get
_hypernymy
_meronymy
_path
_score
(S2
, S1
)
262 def __get_parallelism_score(self
, S1
, S2
):
263 net1
= self
.__net
(S1
, 3, *chain(Nyms
.HYPERNYMY
))
264 net2
= self
.__net
(S2
, 3, *chain(Nyms
.HYPERNYMY
))
265 if S1
in net2
or S2
in net1
or net1
.intersection(net2
):
270 def __is_a_subsequence(self
, a
, b
):
271 for j
in xrange(len(b
)-len(a
)+1):
273 for i
in xrange(len(a
)):
274 if not a
[i
] == b
[i
+j
]:
281 def __get_gloss_score(self
, S1
, S2
):
282 for lemma
in S2
.lemmas
:
283 for example
in self
.__glosses
(S1
):
284 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
286 for lemma
in S1
.lemmas
:
287 for example
in self
.__glosses
(S2
):
288 if self
.__is
_a
_subsequence
(lemma
.name
.split('_'), example
.split(' ')):
292 def __get_topic_score(self
, S1
, S2
):
295 def __get_gloss_hyperonymy_meronymy_path_score(self
, S1
, S2
):
296 for example
in self
.__glosses
(S1
):
297 for word
in example
.split(' '):
298 for synset
in wn
.synsets(word
):
299 if (self
.__get
_hypernymy
_meronymy
_path
_score
(synset
, S2
)
300 or self
.__get
_hyponymy
_holonymy
_path
_score
(synset
, S2
)):
304 def __get_gloss_parallelism_score(self
, S1
, S2
):
305 for example
in self
.__glosses
(S1
):
306 for word
in example
.split(' '):
307 for synset
in wn
.synsets(word
):
308 if self
.__get
_parallelism
_score
(synset
, S2
):
312 def __get_gloss_gloss_score(self
, S1
, S2
):
313 synsets_lists1
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S1
))))
314 synsets_lists2
= list(chain(*((wn
.synsets(word
) for word
in example
.split(' ')) for example
in self
.__glosses
(S2
))))
315 for synsets
in synsets_lists1
:
316 if synsets
in synsets_lists2
:
321 def __get_hyperonymy_meronyomy_gloss_path_score(self
, S1
, S2
):
322 return self
.__get
_gloss
_hyperonymy
_meronymy
_path
_score
(S2
, S1
)
324 def __get_parallelism_gloss_score(self
, S1
, S2
):
325 return self
.__get
_gloss
_parallelism
_score
(S2
, S1
)
327 def __get_best_score_index(self
, scores
):
328 best_score_index
= None
329 for i
in xrange(len(scores
)):
332 if best_score_index
is None:
333 if len(filter(lambda x
: x
!= 0, score
)) != 0:
335 elif self
.__is
_better
_score
(score
, scores
[best_score_index
]):
337 return best_score_index
339 def __is_better_score(self
, score_a
, score_b
):
340 return self
.__value
(score_a
) > self
.__value
(score_b
)
342 def __value(self
, score
):
343 # dictionary order gives weird results.
344 return sum(score
) + 3*len(filter(lambda x
: x
!= 0, score
))