src/mjacob/ontologybuilder/simple_concepts.py

   1 # This Python file uses the following encoding: utf-8
   2 '''
   3 Created on May 5, 2011
   4
   5 @author: mjacob
   6 '''
   7
   8 from itertools import chain
   9 from framework import OntologyBuilderConcepts, OntologyBuilderFramework
  10 import nltk
  11 wn = nltk.corpus.wordnet
  12
  13
  14 class Nyms(object):
  15     HYPERNYMS = nltk.corpus.reader.wordnet.Synset.hypernyms
  16     INSTANCE_HYPERNYMS = nltk.corpus.reader.wordnet.Synset.instance_hypernyms
  17
  18     HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)
  19
  20     HYPONYMS = nltk.corpus.reader.wordnet.Synset.hyponyms
  21     INSTANCE_HYPONYMS = nltk.corpus.reader.wordnet.Synset.instance_hyponyms
  22
  23     HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)
  24
  25     MEMBER_HOLONYMS = nltk.corpus.reader.wordnet.Synset.member_holonyms
  26     SUBSTANCE_HOLONYMS = nltk.corpus.reader.wordnet.Synset.substance_holonyms
  27     PART_HOLONYMS = nltk.corpus.reader.wordnet.Synset.part_holonyms
  28
  29     HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)
  30
  31     MEMBER_MERONYMS = nltk.corpus.reader.wordnet.Synset.member_meronyms
  32     SUBSTANCE_MERONYMS = nltk.corpus.reader.wordnet.Synset.substance_meronyms
  33     PART_MERONYMS = nltk.corpus.reader.wordnet.Synset.part_meronyms
  34
  35     MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)
  36
  37     ATTRIBUTES = nltk.corpus.reader.wordnet.Synset.attributes
  38     ENTAILMENTS = nltk.corpus.reader.wordnet.Synset.entailments
  39     CAUSES = nltk.corpus.reader.wordnet.Synset.causes
  40     ALSO_SEES = nltk.corpus.reader.wordnet.Synset.also_sees
  41     VERB_GROUPS = nltk.corpus.reader.wordnet.Synset.verb_groups
  42     SIMILAR_TOS = nltk.corpus.reader.wordnet.Synset.similar_tos
  43
  44     ALL = (HYPERNYMS
  45          , INSTANCE_HYPERNYMS
  46          , HYPONYMS
  47          , INSTANCE_HYPONYMS
  48          , MEMBER_HOLONYMS
  49          , SUBSTANCE_HOLONYMS
  50          , PART_HOLONYMS
  51          , MEMBER_MERONYMS
  52          , SUBSTANCE_MERONYMS
  53          , PART_MERONYMS
  54          , ATTRIBUTES
  55          , ENTAILMENTS
  56          , CAUSES
  57          , ALSO_SEES
  58          , VERB_GROUPS
  59          , SIMILAR_TOS)
  60
  61 class SimpleConcepts(OntologyBuilderConcepts):
  62     def _get_concepts(self, **state):
  63         synonyms = state[OntologyBuilderFramework.SYNONYMS]
  64
  65         concepts = {}
  66         for synonym in synonyms:
  67             if len(synonym) < 2:
  68                 continue
  69
  70             concept = self.get_concept(synonym)
  71
  72             if concept:
  73                 concept = tuple(concept) # finalize it
  74                 print "%s means %s" % (synonyms[synonym], concept)
  75
  76                 if concept in concepts:
  77                     concepts[concept].update(synonyms[synonym])
  78                 else:
  79                     concepts[concept] = set(synonyms[synonym])
  80             else:
  81                 print "...... unmeaningful: %s" % (synonyms[synonym])
  82
  83         return concepts
  84
  85     def get_concept(self, synonym, debug=False):
  86         initial_synset = self.__get_initial_synset(synonym, debug=debug)
  87         if not initial_synset:
  88             if debug:
  89                 print "no initial synset found"
  90             return None
  91
  92         concept = [initial_synset]
  93
  94         for k in xrange(1, len(synonym)):
  95             best_synset = self.__get_best_synset(synonym, k, debug=debug)
  96
  97             if not best_synset:
  98                 if debug:
  99                     print "no optimal synset found for %s" % (synonym[k])
 100                 return None
 101
 102             concept.append(best_synset)
 103
 104         return concept
 105
 106     def __get_initial_synset(self, synonym, debug=False):
 107         """supposedly chosen manually? we don't have that time.."""
 108         subterm_synsets = list(synonym[0])
 109         scores = []
 110         for subterm_synset in subterm_synsets:
 111             score = self.__get_initial_score(subterm_synset, synonym)
 112             scores.append(score)
 113
 114         if debug:
 115             print subterm_synsets
 116             print scores
 117
 118         index = self.__get_best_score_index(scores)
 119         if index is not None:
 120             return subterm_synsets[index]
 121         else:
 122             return None # we can't do anything w/ this
 123
 124     def __get_initial_score(self, synset, synonym):
 125         for i in xrange(1, len(synonym)):
 126             score, score_total = self.__get_initial_specific_score(synset, synonym, synonym[i])
 127             if score_total != 0:
 128                 return score
 129
 130         return [0] # it's all 0s, we'll deal w/ that later
 131
 132     def __get_initial_specific_score(self, synset, synonym, other_synsets):
 133         score = []
 134         total_score = 0
 135         for other_synset in other_synsets:
 136             if type(other_synset) is not str and type(other_synset) is not unicode:
 137                 raise Exception("wtf is up w/ this type %s %s (%s)" % (other_synset, type(other_synset), synonym))
 138             intersection_score = self.__get_intersection_score(synset, other_synset)
 139             score.append(intersection_score)
 140             total_score += intersection_score
 141         return score, total_score
 142
 143     def __get_best_synset(self, synonym, k, debug=False):
 144         subterm_synsets = list(synonym[k])
 145         scores = []
 146         for subterm_synset in subterm_synsets:
 147             score = self.__get_score(subterm_synset, synonym, k)
 148             scores.append(score)
 149
 150         if debug:
 151             print subterm_synsets
 152             print scores
 153
 154         index = self.__get_best_score_index(scores)
 155         if index is not None:
 156             return subterm_synsets[index]
 157         else:
 158             return None # we can't do anything w/ this
 159
 160     def __get_score(self, synset, synonym, k):
 161         for i in xrange(k-1, -1, -1):
 162             score, score_total = self.__get_specific_score(synset, synonym, synonym[i])
 163             if score_total != 0:
 164                 return score
 165
 166         return [0] # it's all 0s, we'll deal w/ that later
 167
 168     def __get_specific_score(self, synset, synonym, other_synsets):
 169         score = []
 170         total_score = 0
 171         for other_synset in other_synsets:
 172             intersection_score = self.__get_intersection_score(other_synset, synset)
 173             score.append(intersection_score)
 174             total_score += intersection_score
 175         return score, total_score
 176
 177     def __get_intersection_score(self, other_synset, synset):
 178         S1 = wn.synset(other_synset)
 179         S2 = wn.synset(synset)
 180         score = 0
 181         score += self.__get_colour_score(S1, S2)
 182         score += self.__get_domain_score(S1, S2)
 183         score += self.__get_synonymy_score(S1, S2)
 184         score += self.__get_hypernymy_meronymy_path_score(S1, S2)
 185         score += self.__get_hyponymy_holonymy_path_score(S1, S2)
 186         score += self.__get_parallelism_score(S1, S2)
 187         score += self.__get_gloss_score(S1, S2)
 188         score += self.__get_topic_score(S1, S2)
 189         score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
 190         score += self.__get_gloss_parallelism_score(S1, S2)
 191         score += self.__get_gloss_gloss_score(S1, S2)
 192         score += self.__get_hyperonymy_meronyomy_gloss_path_score(S1, S2)
 193         score += self.__get_parallelism_gloss_score(S1, S2)
 194         #print "intersection score: %s %s %s" % (other_synset, synset, score)
 195         return score
 196
    # WordNet synsets looked up once, when the class body is executed.
    CHROMATIC = wn.synset('chromatic.a.03')
    PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
    # Every synset WordNet lists as "similar to" CHROMATIC; __get_colour_score
    # tests membership in this set (presumably the colour adjectives).
    COLORS = frozenset(CHROMATIC.similar_tos())
 200
 201     def __get_colour_score(self, S1, S2):
 202         if S1 in SimpleConcepts.COLORS and SimpleConcepts.PHYSICAL_ENTITY in chain(*S2.hypernym_paths()):
 203             return 1
 204         else:
 205             return 0
 206
 207     def __get_domain_score(self, S1, S2):
 208         import re
 209         DOMAIN_LABEL = re.compile('\(of ([^\)]*)\)')
 210         m = DOMAIN_LABEL.match(S1.definition)
 211         if m:
 212             domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
 213             score = 0
 214             for domain_label_synset in domain_label_synsets:
 215                 if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
 216                     score += 1
 217             return score
 218         return 0
 219
 220     def __glosses(self, synset):
 221         l = list(synset.examples)
 222         l.append(synset.definition)
 223         for other in synset.similar_tos():
 224             l.append(other.definition)
 225             l.extend(other.examples)
 226         return l
 227
 228     def __get_synonymy_score(self, S1, S2):
 229         if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]):
 230             return 1
 231         else:
 232             return 0
 233
 234     def __net(self, synset, max_depth=3, *relations):
 235         if max_depth < 1:
 236             return []
 237
 238         if not relations:
 239             relations = Nyms.ALL
 240
 241         connections = frozenset(chain(*[nym(synset) for nym in relations]))
 242
 243         if max_depth == 1:
 244             return connections
 245
 246         else:
 247             return frozenset(chain(connections,
 248                                    chain(*[self.__net(connection, max_depth-1, *relations) for connection in connections])))
 249
 250
 251     def __get_hypernymy_meronymy_path_score(self, S1, S2):
 252         net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY, Nyms.MERONYMY))
 253         net2 = self.__net(S2, 3, *chain(Nyms.HYPONYMY, Nyms.HOLONYMY))
 254         if S1 in net2 or S2 in net1 or net1.intersection(net2):
 255             return 1
 256         else:
 257             return 0
 258
    def __get_hyponymy_holonymy_path_score(self, S1, S2):
        """Return the hypernymy/meronymy path score with the operands swapped."""
        return self.__get_hypernymy_meronymy_path_score(S2, S1)
 261
 262     def __get_parallelism_score(self, S1, S2):
 263         net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY))
 264         net2 = self.__net(S2, 3, *chain(Nyms.HYPERNYMY))
 265         if S1 in net2 or S2 in net1 or net1.intersection(net2):
 266             return 1
 267         else:
 268             return 0
 269
 270     def __is_a_subsequence(self, a, b):
 271         for j in xrange(len(b)-len(a)+1):
 272             match = True
 273             for i in xrange(len(a)):
 274                 if not a[i] == b[i+j]:
 275                     match=False
 276                     break
 277             if match:
 278                 return True
 279         return False
 280
 281     def __get_gloss_score(self, S1, S2):
 282         for lemma in S2.lemmas:
 283             for example in self.__glosses(S1):
 284                 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
 285                     return 1
 286         for lemma in S1.lemmas:
 287             for example in self.__glosses(S2):
 288                 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
 289                     return 1
 290         return 0
 291
    def __get_topic_score(self, S1, S2):
        """Stub: always returns 0, so it adds nothing to the intersection score."""
        return 0
 294
 295     def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
 296         for example in self.__glosses(S1):
 297             for word in example.split(' '):
 298                 for synset in wn.synsets(word):
 299                     if (self.__get_hypernymy_meronymy_path_score(synset, S2)
 300                      or self.__get_hyponymy_holonymy_path_score(synset, S2)):
 301                         return 1
 302         return 0
 303
 304     def __get_gloss_parallelism_score(self, S1, S2):
 305         for example in self.__glosses(S1):
 306             for word in example.split(' '):
 307                 for synset in wn.synsets(word):
 308                     if self.__get_parallelism_score(synset, S2):
 309                         return 1
 310         return 0
 311
 312     def __get_gloss_gloss_score(self, S1, S2):
 313         synsets_lists1 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S1))))
 314         synsets_lists2 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S2))))
 315         for synsets in synsets_lists1:
 316             if synsets in synsets_lists2:
 317                 return 1
 318
 319         return 0
 320
    def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
        """Return the gloss hypernymy/meronymy path score with the operands swapped."""
        return self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)
 323
    def __get_parallelism_gloss_score(self, S1, S2):
        """Return the gloss parallelism score with the operands swapped."""
        return self.__get_gloss_parallelism_score(S2, S1)
 326
 327     def __get_best_score_index(self, scores):
 328         best_score_index = None
 329         for i in xrange(len(scores)):
 330             score = scores[i]
 331             if score:
 332                 if best_score_index is None:
 333                     if len(filter(lambda x: x != 0, score)) != 0:
 334                         best_score_index = i
 335                 elif self.__is_better_score(score, scores[best_score_index]):
 336                     best_score_index = i
 337         return best_score_index
 338
 339     def __is_better_score(self, score_a, score_b):
 340         return self.__value(score_a) > self.__value(score_b)
 341
 342     def __value(self, score):
 343         # dictionary order gives weird results.
 344         return sum(score) + 3*len(filter(lambda x: x != 0, score))