Nearly everything is good to go, modulo a minor hierarchy bug.
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concepts.py
blob0ce0d8fe7d23fb434556faad54cfebffc5f4fa8e
# This Python file uses the following encoding: utf-8
'''
Created on May 5, 2011

@author: mjacob
'''
import re
from itertools import chain

import nltk

from framework import OntologyBuilderConcepts, OntologyBuilderFramework
from util.cached import Cached
from util.memoized import Memoized

wn = nltk.corpus.wordnet # this is to subvert a bug in my IDE

# Matches a definition of the form "(of X) ...": wordnet marks some senses
# as applying only to a specific domain X (e.g. "(of birds) ...").
DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')

# Matches the name of a nominal (noun) synset, e.g. "dog.n.01".
NOMINAL_SYNSET = re.compile(r'\w+\.n\.\d+$')
class Nyms(object):
    """
    Shortcuts for referring to particular relationships in wordnet

    HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.hypernyms}
    INSTANCE_HYPERNYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hypernyms}

    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.hyponyms}
    INSTANCE_HYPONYMS = C{nltk.corpus.reader.wordnet.Synset.instance_hyponyms}

    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.member_holonyms}
    SUBSTANCE_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_holonyms}
    PART_HOLONYMS = C{nltk.corpus.reader.wordnet.Synset.part_holonyms}

    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.member_meronyms}
    SUBSTANCE_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.substance_meronyms}
    PART_MERONYMS = C{nltk.corpus.reader.wordnet.Synset.part_meronyms}

    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = C{nltk.corpus.reader.wordnet.Synset.attributes}
    ENTAILMENTS = C{nltk.corpus.reader.wordnet.Synset.entailments}
    CAUSES = C{nltk.corpus.reader.wordnet.Synset.causes}
    ALSO_SEES = C{nltk.corpus.reader.wordnet.Synset.also_sees}
    VERB_GROUPS = C{nltk.corpus.reader.wordnet.Synset.verb_groups}
    SIMILAR_TOS = C{nltk.corpus.reader.wordnet.Synset.similar_tos}

    ALL = all of them
    """
    # private alias so the relation shortcuts below don't each repeat the
    # full dotted path to the Synset class
    _SYNSET = nltk.corpus.reader.wordnet.Synset

    HYPERNYMS = _SYNSET.hypernyms
    INSTANCE_HYPERNYMS = _SYNSET.instance_hypernyms

    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = _SYNSET.hyponyms
    INSTANCE_HYPONYMS = _SYNSET.instance_hyponyms

    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = _SYNSET.member_holonyms
    SUBSTANCE_HOLONYMS = _SYNSET.substance_holonyms
    PART_HOLONYMS = _SYNSET.part_holonyms

    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = _SYNSET.member_meronyms
    SUBSTANCE_MERONYMS = _SYNSET.substance_meronyms
    PART_MERONYMS = _SYNSET.part_meronyms

    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = _SYNSET.attributes
    ENTAILMENTS = _SYNSET.entailments
    CAUSES = _SYNSET.causes
    ALSO_SEES = _SYNSET.also_sees
    VERB_GROUPS = _SYNSET.verb_groups
    SIMILAR_TOS = _SYNSET.similar_tos

    # every relation above, in the same order as the original enumeration
    ALL = (HYPERNYMY
           + HYPONYMY
           + HOLONYMY
           + MERONYMY
           + (ATTRIBUTES,
              ENTAILMENTS,
              CAUSES,
              ALSO_SEES,
              VERB_GROUPS,
              SIMILAR_TOS))
class SimpleConcepts(OntologyBuilderConcepts):
    """
    Generate concepts for terms out of collections of wordnet synsets.

    The basic idea is taken from [Navigli and Velardi 2002], though it's
    simplified somewhat.

    Concepts are basically generated by doing a primitive sort of lexical
    disambiguation, by computing scores for a subterm based upon the presence
    of specific connections in wordnet of a subterm and the subterm preceding it.

    In [Navigli and Velardi 2002], the synset for the initial subterm was
    chosen manually, but this algorithm instead tries to match it against
    the subterm following it. It seems to work well enough for the purposes
    of demonstration.
    """

    # Synsets backing the colour heuristic.  NOTE: evaluating these touches
    # the wordnet corpus at class-definition time.
    CHROMATIC = wn.synset('chromatic.a.03')
    PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
    COLORS = frozenset(CHROMATIC.similar_tos())

    @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPTS)
    def _get_concepts(self, **state):
        """
        given a collection of synonyms (bundles of synsets per subterm),
        disambiguate the synsets and return a single synset per subterm.

        12 specific structural relations mentioned in [Navigli and Velardi 2002]
        are used in the process of disambiguation.

        @param state: framework state; must contain the SYNONYMS mapping
            (synonym tuple -> collection of surface terms)
        @return: dict mapping each concept (tuple of synset names) to the
            set of surface terms it covers
        """
        synonyms = state[OntologyBuilderFramework.SYNONYMS]

        concepts = {}
        for synonym in synonyms:
            if len(synonym) < 2:
                continue  # one lone subterm gives us nothing to compare against

            # make sure that the head of the concept is nominal
            nominal_concept = self.__ensure_nominal(synonym)
            if not nominal_concept[-1]:
                continue  # if not a nominal concept, just skip it

            concept = self.get_concept(nominal_concept)
            if concept:
                concept = tuple(concept)  # finalize it
                print("%s means %s" % (synonyms[synonym], concept))
                if concept in concepts:
                    concepts[concept].update(synonyms[synonym])
                else:
                    concepts[concept] = set(synonyms[synonym])
            else:
                # explicit 1-tuple so a tuple-valued argument can't break "%"
                print("...... unmeaningful: %s" % (synonyms[synonym],))

        return concepts

    def __ensure_nominal(self, synonym):
        """filters out non-nominal synsets from the head of the term"""
        # keep only noun senses (names like "dog.n.01") in the last subterm
        filtered_end = [s for s in synonym[-1] if NOMINAL_SYNSET.match(s)]
        return tuple(chain(synonym[:-1], [filtered_end]))

    def get_concept(self, synonym, debug=False):
        """
        disambiguate a specific synonym.

        @return: a list with one synset name per subterm, or None when any
            subterm could not be disambiguated
        """
        initial_synset = self.__get_initial_synset(synonym, debug=debug)
        if not initial_synset:
            if debug:
                print("no initial synset found")
            return None

        concept = [initial_synset]
        for k in range(1, len(synonym)):
            best_synset = self.__get_best_synset(synonym, k, debug=debug)
            if not best_synset:
                if debug:
                    # explicit 1-tuple: synonym[k] is itself a collection and
                    # would previously have been spliced into the "%" args
                    print("no optimal synset found for %s" % (synonym[k],))
                return None
            concept.append(best_synset)

        return concept

    def __pick_best(self, candidates, score_of, debug=False):
        """
        score every candidate synset with C{score_of} and return the best
        one, or None when no candidate scored at all.
        """
        candidates = list(candidates)
        scores = [score_of(candidate) for candidate in candidates]

        if debug:
            print(candidates)
            print(scores)

        index = self.__get_best_score_index(scores)
        if index is not None:
            return candidates[index]
        else:
            return None  # we can't do anything w/ this

    def __get_initial_synset(self, synonym, debug=False):
        """
        compares the synsets for the initial subterm to those coming after it
        to attempt to disambiguate the sense of the subterm.
        """
        return self.__pick_best(
            synonym[0],
            lambda synset: self.__get_initial_score(synset, synonym),
            debug=debug)

    def __get_best_synset(self, synonym, k, debug=False):
        """
        disambiguate the synsets of a subterm by looking for particular
        relations in wordnet between them and the synsets of the preceding
        subterm.

        returns the best synset, or None if no relationships are found.
        """
        return self.__pick_best(
            synonym[k],
            lambda synset: self.__get_score(synset, synonym, k),
            debug=debug)

    def __get_initial_score(self, synset, synonym):
        """
        get the score for a particular initial synset.
        If no matching synsets are found at all, returns C{[0]}
        """
        # walk forward through the following subterms until one shows any
        # relationship at all
        for i in range(1, len(synonym)):
            score, score_total = self.__get_initial_specific_score(synset, synonym[i])
            if score_total != 0:
                return score
        return [0]  # it's all 0s, we'll dump the term later

    def __get_initial_specific_score(self, synset, other_synsets):
        """
        get the initial score of a synset compared to each of a collection
        of other synsets.  Note the candidate is passed as S1 here.

        @return: (list of per-synset scores, their sum)
        """
        score = [self.__get_intersection_score(synset, other_synset)
                 for other_synset in other_synsets]
        return score, sum(score)

    def __get_score(self, synset, synonym, k):
        """
        get the score of a C{synset} compared to all of the synsets
        at subterm C{k-1}

        if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
        """
        for i in range(k - 1, -1, -1):
            score, score_total = self.__get_specific_score(synset, synonym[i])
            if score_total != 0:
                return score
        return [0]  # it's all 0s, we'll deal w/ that later

    def __get_specific_score(self, synset, other_synsets):
        """
        get the score of a synset compared to each of a collection of other
        synsets.  Note the preceding synsets are passed as S1 here.

        @return: (list of per-synset scores, their sum)
        """
        score = [self.__get_intersection_score(other_synset, synset)
                 for other_synset in other_synsets]
        return score, sum(score)

    @Memoized
    def __get_intersection_score(self, other_synset, synset):
        """
        count the number of semantically meaningful relationships in wordnet
        as specified in [Navigli and Velardi 2002]

        @param other_synset: synset name of the left-hand sense (S1)
        @param synset: synset name of the right-hand sense (S2)
        """
        S1 = wn.synset(other_synset)
        S2 = wn.synset(synset)
        score = 0
        score += self.__get_colour_score(S1, S2)
        score += self.__get_domain_score(S1, S2)
        score += self.__get_synonymy_score(S1, S2)
        score += self.__get_hypernymy_meronymy_path_score(S1, S2)
        score += self.__get_hyponymy_holonymy_path_score(S1, S2)
        score += self.__get_parallelism_score(S1, S2)
        score += self.__get_gloss_score(S1, S2)
        score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
        score += self.__get_gloss_parallelism_score(S1, S2)
        score += self.__get_gloss_gloss_score(S1, S2)
        score += self.__get_hyperonymy_meronyomy_gloss_path_score(S1, S2)
        score += self.__get_parallelism_gloss_score(S1, S2)
        return score

    def __get_colour_score(self, S1, S2):
        """
        if S1 is a color and S2 is a physical object, return 1. else return 0.
        """
        if (S1 in SimpleConcepts.COLORS
                and SimpleConcepts.PHYSICAL_ENTITY in chain(*S2.hypernym_paths())):
            return 1
        else:
            return 0

    def __get_domain_score(self, S1, S2):
        """
        if the definition of S1 is of the form that it indicates a property
        of something specific (a "(of X) ..." gloss), see if S2 is that type
        of thing.
        """
        m = DOMAIN_LABEL.match(S1.definition)
        if not m:
            return 0
        domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
        score = 0
        for domain_label_synset in domain_label_synsets:
            if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
                score += 1
        return score

    @Memoized
    def __glosses(self, synset):
        """
        return a list of all definitions and examples of the synset,
        along with all definitions and examples of anything similar to it.
        """
        glosses = list(synset.examples)
        glosses.append(synset.definition)
        for other in synset.similar_tos():
            glosses.append(other.definition)
            glosses.extend(other.examples)
        return glosses

    def __get_synonymy_score(self, S1, S2):
        """
        If S1 and S2 are synonyms (or S2 pertains to a lemma of S1),
        return 1. else 0.
        """
        if S1 == S2:
            return 1
        pertainym_synsets = chain(*[(b.synset for b in a.pertainyms())
                                    for a in S1.lemmas])
        if S2 in pertainym_synsets:
            return 1
        return 0

    @Memoized
    def __net(self, synset, max_depth=3, *relations):
        """
        finds all connected synsets of @param synset of the kind specified in
        @param relations, up to @param max_depth, which has a default of 3.
        """
        if max_depth < 1:
            # was []; normalized so every path returns a frozenset
            return frozenset()

        if not relations:
            relations = Nyms.ALL

        connections = frozenset(chain(*[nym(synset) for nym in relations]))

        if max_depth == 1:
            return connections
        return frozenset(chain(connections,
                               chain(*[self.__net(connection, max_depth - 1, *relations)
                                       for connection in connections])))

    def __get_hypernymy_meronymy_path_score(self, S1, S2):
        """
        If something above S1 and something below S2 are identical,
        return 1, otherwise return 0.
        """
        # NOTE(review): hypernymy is paired with MERONYMY here and hyponymy
        # with HOLONYMY; confirm against [Navigli and Velardi 2002] that this
        # is the intended direction ("minor hierarchy bug"?).
        net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY, Nyms.MERONYMY))
        net2 = self.__net(S2, 3, *chain(Nyms.HYPONYMY, Nyms.HOLONYMY))
        if S1 in net2 or S2 in net1 or net1.intersection(net2):
            return 1
        else:
            return 0

    def __get_hyponymy_holonymy_path_score(self, S1, S2):
        """
        If something above S2 and something below S1 are identical,
        return 1, otherwise return 0.
        """
        return self.__get_hypernymy_meronymy_path_score(S2, S1)

    def __get_parallelism_score(self, S1, S2):
        """
        If S1 and S2 share something above them, return 1, else return 0
        """
        net1 = self.__net(S1, 3, *Nyms.HYPERNYMY)
        net2 = self.__net(S2, 3, *Nyms.HYPERNYMY)
        if S1 in net2 or S2 in net1 or net1.intersection(net2):
            return 1
        else:
            return 0

    def __is_a_subsequence(self, a, b):
        """
        returns true if @param a is a contiguous subsequence of @param b
        """
        n = len(a)
        for j in range(len(b) - n + 1):
            if b[j:j + n] == a:
                return True
        return False

    def __get_gloss_score(self, S1, S2):
        """
        if a word representing S1 is mentioned in the glosses of S2 or vice versa,
        return 1, otherwise return 0
        """
        if self.__mentions(S1, S2) or self.__mentions(S2, S1):
            return 1
        return 0

    def __mentions(self, glossed, named):
        """true if some lemma of C{named} appears in a gloss of C{glossed}"""
        for lemma in named.lemmas:
            words = lemma.name.split('_')
            for gloss in self.__glosses(glossed):
                if self.__is_a_subsequence(words, gloss.split(' ')):
                    return True
        return False

    def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
        """
        if something in the gloss of S1 is above or below S2, return 1
        otherwise 0
        """
        for gloss in self.__glosses(S1):
            for word in gloss.split(' '):
                for synset in wn.synsets(word):
                    if (self.__get_hypernymy_meronymy_path_score(synset, S2)
                            or self.__get_hyponymy_holonymy_path_score(synset, S2)):
                        return 1
        return 0

    def __get_gloss_parallelism_score(self, S1, S2):
        """
        if there is a common term above S2 and a word in the glosses of S1,
        return 1, otherwise 0
        """
        for gloss in self.__glosses(S1):
            for word in gloss.split(' '):
                for synset in wn.synsets(word):
                    if self.__get_parallelism_score(synset, S2):
                        return 1
        return 0

    def __gloss_synsets(self, synset):
        """every synset of every word appearing in the glosses of C{synset}"""
        return set(chain(*[wn.synsets(word)
                           for gloss in self.__glosses(synset)
                           for word in gloss.split(' ')]))

    def __get_gloss_gloss_score(self, S1, S2):
        """
        If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
        return 1, otherwise 0.
        """
        # set intersection instead of the former O(n*m) list-membership scan
        if self.__gloss_synsets(S1) & self.__gloss_synsets(S2):
            return 1
        return 0

    def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
        """
        if something in the gloss of S2 is above or below S1, return 1
        otherwise 0
        """
        return self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)

    def __get_parallelism_gloss_score(self, S1, S2):
        """
        if there is a common term above S1 and a word in the glosses of S2,
        return 1, otherwise 0
        """
        return self.__get_gloss_parallelism_score(S2, S1)

    def __get_best_score_index(self, scores):
        """
        given a set of scores, return the index of the "best" one, or None
        when every score is empty or all-zero.

        scores are assumed to be lists of numbers
        """
        best_score_index = None
        for i, score in enumerate(scores):
            if not score:
                continue
            if best_score_index is None:
                # only a score with at least one nonzero entry can seed the best
                if any(x != 0 for x in score):
                    best_score_index = i
            elif self.__is_better_score(score, scores[best_score_index]):
                best_score_index = i
        return best_score_index

    def __is_better_score(self, score_a, score_b):
        """
        returns true if score_a is better than score_b
        """
        return self.__value(score_a) > self.__value(score_b)

    def __value(self, score):
        """
        [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
        but don't specify exactly what they mean by this, and my attempts at recreating it
        produced incredibly weird results. Thus, I've opted for something a little simpler
        that seems to work well enough in practice.

        the "value" of a score is considered to be 3* the number of non-0 subscores in it,
        plus the sum total of the subscores.
        thus, having two scores of "1" results in a total score of 8, which is equivalent
        to a single score of "5"
        """
        # dictionary order gives weird results.
        return sum(score) + 3 * sum(1 for x in score if x != 0)