dependencies
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concepts.py
blob803be139eb5a45b92b9f4bc4fa4c94a4600f8ed4
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 5, 2011
5 @author: mjacob
6 '''
8 from itertools import chain
9 from framework import OntologyBuilderConcepts, OntologyBuilderFramework
10 import nltk
11 from util.cached import Cached
12 from util.memoized import Memoized
13 wn = nltk.corpus.wordnet # this is to subvert a bug in my IDE
14 import re
# Matches WordNet adjective glosses of the form "(of X) ...", which indicate
# that the sense applies to a specific domain (e.g. "(of birds) ...").
# Raw strings keep the backslash escapes literal instead of relying on
# Python passing unknown string escapes through unchanged.
DOMAIN_LABEL = re.compile(r'\(of ([^\)]*)\)')
# Matches a nominal synset name of the form "word.n.NN" (e.g. "dog.n.01").
NOMINAL_SYNSET = re.compile(r'\w+\.n\.\d+$')
class Nyms(object):
    """
    Shortcuts for referring to particular relationships in wordnet.

    Each attribute is an unbound accessor on
    C{nltk.corpus.reader.wordnet.Synset}; the plural-named tuples group
    related accessors:

    HYPERNYMY - hypernyms + instance hypernyms ("above" relations)
    HYPONYMY  - hyponyms + instance hyponyms ("below" relations)
    HOLONYMY  - member/substance/part holonyms (is-part-of relations)
    MERONYMY  - member/substance/part meronyms (has-part relations)
    ALL       - every accessor listed on this class
    """
    HYPERNYMS = nltk.corpus.reader.wordnet.Synset.hypernyms
    INSTANCE_HYPERNYMS = nltk.corpus.reader.wordnet.Synset.instance_hypernyms
    HYPERNYMY = (HYPERNYMS, INSTANCE_HYPERNYMS)

    HYPONYMS = nltk.corpus.reader.wordnet.Synset.hyponyms
    INSTANCE_HYPONYMS = nltk.corpus.reader.wordnet.Synset.instance_hyponyms
    HYPONYMY = (HYPONYMS, INSTANCE_HYPONYMS)

    MEMBER_HOLONYMS = nltk.corpus.reader.wordnet.Synset.member_holonyms
    SUBSTANCE_HOLONYMS = nltk.corpus.reader.wordnet.Synset.substance_holonyms
    PART_HOLONYMS = nltk.corpus.reader.wordnet.Synset.part_holonyms
    HOLONYMY = (MEMBER_HOLONYMS, SUBSTANCE_HOLONYMS, PART_HOLONYMS)

    MEMBER_MERONYMS = nltk.corpus.reader.wordnet.Synset.member_meronyms
    SUBSTANCE_MERONYMS = nltk.corpus.reader.wordnet.Synset.substance_meronyms
    PART_MERONYMS = nltk.corpus.reader.wordnet.Synset.part_meronyms
    MERONYMY = (MEMBER_MERONYMS, SUBSTANCE_MERONYMS, PART_MERONYMS)

    ATTRIBUTES = nltk.corpus.reader.wordnet.Synset.attributes
    ENTAILMENTS = nltk.corpus.reader.wordnet.Synset.entailments
    CAUSES = nltk.corpus.reader.wordnet.Synset.causes
    ALSO_SEES = nltk.corpus.reader.wordnet.Synset.also_sees
    VERB_GROUPS = nltk.corpus.reader.wordnet.Synset.verb_groups
    SIMILAR_TOS = nltk.corpus.reader.wordnet.Synset.similar_tos

    # Concatenating the groupings yields the same members in the same order
    # as listing every accessor individually.
    ALL = (HYPERNYMY + HYPONYMY + HOLONYMY + MERONYMY
           + (ATTRIBUTES, ENTAILMENTS, CAUSES,
              ALSO_SEES, VERB_GROUPS, SIMILAR_TOS))
100 class SimpleConcepts(OntologyBuilderConcepts):
102 Generate concepts for terms out of collections of wordnet synsets.
104 The basic idea is taken from [Navigli and Velardi 2002], though it's simplified somewhat.
106 Concepts are basically generated by doing a primitive sort of lexical
107 disambiguation, by computing scores for a subterm based upon the presence
108 of specific connections in wordnet of a subterm and the subterm preceding it.
110 In [Navigli and Velardi 2002], the synset for the initial subterm was
111 chosen manually, but this algorithm instead tries to match it against
112 the subterm following it. It seems to work well enough for the purposes
113 of demonstration.
116 def __init__(self, debug=False):
117 self.__debug = debug
119 @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPTS)
120 def _get_concepts(self, **state):
122 given a collection of synonyms (bundles of synsets per subterm),
123 disambiguate the synsets and return a single synsets per subterm.
125 12 specific structural relations mentioned in [Navigli and Velardi 2002]
126 are used in the process of disambiguation.
128 synonyms = state[OntologyBuilderFramework.SYNONYMS]
130 concepts = {}
131 for synonym in synonyms:
132 if len(synonym) < 2:
133 continue
135 """make sure that the head of the concept is nominal"""
136 nominal_concept = self.__ensure_nominal(synonym)
137 if not nominal_concept[-1]:
138 continue # if not a nominal concept, just skip it
140 concept = self.get_concept(nominal_concept)
142 if concept:
143 concept = tuple(concept) # finalize it
144 if self.__debug:
145 print "%s means %s" % (synonyms[synonym], concept)
147 if concept in concepts:
148 concepts[concept].update(synonyms[synonym])
149 else:
150 concepts[concept] = set(synonyms[synonym])
151 elif self.__debug:
152 print "...... unmeaningful: %s" % (synonyms[synonym])
154 return concepts
156 def __ensure_nominal(self, synonym):
157 """filters out non-nominal synsets from the head of the term"""
158 filtered_end = filter(NOMINAL_SYNSET.match, synonym[-1])
159 return tuple(chain(synonym[:-1], [filtered_end]))
161 def get_concept(self, synonym):
163 disambiguate a specific synonym.
165 initial_synset = self.__get_initial_synset(synonym)
166 if not initial_synset:
167 if self.__debug:
168 print "no initial synset found"
169 return None
171 concept = [initial_synset]
173 for k in xrange(1, len(synonym)):
174 best_synset = self.__get_best_synset(synonym, k)
176 if not best_synset:
177 if self.__debug:
178 print "no optimal synset found for %s" % (synonym[k])
179 return None
181 concept.append(best_synset)
183 return concept
185 def __get_initial_synset(self, synonym):
187 compares the synsets for the initial subterm to those coming after it
188 to attempt to disambiguate the sense of the subterm.
190 subterm_synsets = list(synonym[0])
191 scores = []
192 for subterm_synset in subterm_synsets:
193 score = self.__get_initial_score(subterm_synset, synonym)
194 scores.append(score)
196 if self.__debug:
197 print subterm_synsets
198 print scores
200 index = self.__get_best_score_index(scores)
201 if index is not None:
202 return subterm_synsets[index]
203 else:
204 return None # we can't do anything w/ this
206 def __get_initial_score(self, synset, synonym):
208 get the score for a particular initial synset.
209 If no matching synsets are found at all, returns C{[0]}
211 for i in xrange(1, len(synonym)):
212 score, score_total = self.__get_initial_specific_score(synset, synonym[i])
213 if score_total != 0:
214 return score
216 return [0] # it's all 0s, we'll dump the term later
218 def __get_initial_specific_score(self, synset, other_synsets):
220 get the initial score of a synset compared to each of a collection of other synsets
222 score = [self.__get_intersection_score(synset, other_synset)
223 for other_synset in other_synsets]
224 return score, sum(score)
226 def __get_best_synset(self, synonym, k):
228 disambiguate the synsets of a subterm by looking for particular relations in wordnet
229 between them and the the synsets of the preceding subterm.
231 returns the best synset, or None if no relationships are found.
233 subterm_synsets = list(synonym[k])
234 scores = []
235 for subterm_synset in subterm_synsets:
236 score = self.__get_score(subterm_synset, synonym, k)
237 scores.append(score)
239 if self.__debug:
240 print subterm_synsets
241 print scores
243 index = self.__get_best_score_index(scores)
244 if index is not None:
245 return subterm_synsets[index]
246 else:
247 return None # we can't do anything w/ this
249 def __get_score(self, synset, synonym, k):
251 get the score of a C{synset} compared to all of the synsets
252 at subterm C{k-1}
254 if no matches are found at C{k-1}, try C{k-2} all the way to C{0}
256 for i in xrange(k-1, -1, -1):
257 score, score_total = self.__get_specific_score(synset, synonym, synonym[i])
258 if score_total != 0:
259 return score
261 return [0] # it's all 0s, we'll deal w/ that later
263 def __get_specific_score(self, synset, synonym, other_synsets):
265 get the score of a synset compared to each of a collection of other synsets
267 score = [self.__get_intersection_score(other_synset, synset)
268 for other_synset in other_synsets]
269 return score, sum(score)
271 @Memoized
272 def __get_intersection_score(self, other_synset, synset):
274 count the number of semantically meaningful relationships in wordnet
275 as specified in [Navigli and Velardi 2002]
277 S1 = wn.synset(other_synset)
278 S2 = wn.synset(synset)
279 score = 0
280 score += self.__get_colour_score(S1, S2)
281 score += self.__get_domain_score(S1, S2)
282 score += self.__get_synonymy_score(S1, S2)
283 score += self.__get_hypernymy_meronymy_path_score(S1, S2)
284 score += self.__get_hyponymy_holonymy_path_score(S1, S2)
285 score += self.__get_parallelism_score(S1, S2)
286 score += self.__get_gloss_score(S1, S2)
287 score += self.__get_gloss_hyperonymy_meronymy_path_score(S1, S2)
288 score += self.__get_gloss_parallelism_score(S1, S2)
289 score += self.__get_gloss_gloss_score(S1, S2)
290 score += self.__get_hyperonymy_meronyomy_gloss_path_score(S1, S2)
291 score += self.__get_parallelism_gloss_score(S1, S2)
292 #print "intersection score: %s %s %s" % (other_synset, synset, score)
293 return score
295 CHROMATIC = wn.synset('chromatic.a.03')
296 PHYSICAL_ENTITY = wn.synset('physical_entity.n.01')
297 COLORS = frozenset(CHROMATIC.similar_tos())
299 def __get_colour_score(self, S1, S2):
301 if S1 is a color and S2 is a physical object, return 1. else return 0.
303 if S1 in SimpleConcepts.COLORS and SimpleConcepts.PHYSICAL_ENTITY in chain(*S2.hypernym_paths()):
304 return 1
305 else:
306 return 0
308 def __get_domain_score(self, S1, S2):
310 if the definition of S1 is of the form that it indicates a property
311 of something specific, see if S2 is that type of thing.
313 m = DOMAIN_LABEL.match(S1.definition)
314 if m:
315 domain_label_synsets = wn.synsets(m.group(1).replace(' ', '_'), pos=wn.NOUN)
316 score = 0
317 for domain_label_synset in domain_label_synsets:
318 if S2 == domain_label_synset or S2 in domain_label_synset.hyponyms():
319 score += 1
320 return score
321 return 0
323 @Memoized
324 def __glosses(self, synset):
326 return a list of all definitions and examples of the synset,
327 along with all definitions and examples of anything similar to it.
329 l = list(synset.examples)
330 l.append(synset.definition)
331 for other in synset.similar_tos():
332 l.append(other.definition)
333 l.extend(other.examples)
334 return l
336 def __get_synonymy_score(self, S1, S2):
338 If S1 and S2 are synonyms, return 1. else 0.
340 if S1 == S2 or S2 in chain(*[(b.synset for b in a.pertainyms()) for a in S1.lemmas]):
341 return 1
342 else:
343 return 0
345 @Memoized
346 def __net(self, synset, max_depth=3, *relations):
348 finds all connected synsets of @param synset of the kind specified in @param relations,
349 up to @param max_depth, which has a default of 3.
351 if max_depth < 1:
352 return []
354 if not relations:
355 relations = Nyms.ALL
357 connections = frozenset(chain(*[nym(synset) for nym in relations]))
359 if max_depth == 1:
360 return connections
362 else:
363 return frozenset(chain(connections,
364 chain(*[self.__net(connection, max_depth-1, *relations) for connection in connections])))
367 def __get_hypernymy_meronymy_path_score(self, S1, S2):
369 If something above S1 and something below S2 are identical,
370 return 1, otherwise return 0.
372 net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY, Nyms.MERONYMY))
373 net2 = self.__net(S2, 3, *chain(Nyms.HYPONYMY, Nyms.HOLONYMY))
374 if S1 in net2 or S2 in net1 or net1.intersection(net2):
375 return 1
376 else:
377 return 0
379 def __get_hyponymy_holonymy_path_score(self, S1, S2):
381 If something above S2 and something below S1 are identical,
382 return 1, otherwise return 0.
384 return self.__get_hypernymy_meronymy_path_score(S2, S1)
386 def __get_parallelism_score(self, S1, S2):
388 If S1 and S2 share something above them, return 1, else return 0
390 net1 = self.__net(S1, 3, *chain(Nyms.HYPERNYMY))
391 net2 = self.__net(S2, 3, *chain(Nyms.HYPERNYMY))
392 if S1 in net2 or S2 in net1 or net1.intersection(net2):
393 return 1
394 else:
395 return 0
397 def __is_a_subsequence(self, a, b):
399 returns true if @param a is a subsequence of @param b
401 for j in xrange(len(b)-len(a)+1):
402 match = True
403 for i in xrange(len(a)):
404 if not a[i] == b[i+j]:
405 match=False
406 break
407 if match:
408 return True
409 return False
411 def __get_gloss_score(self, S1, S2):
413 if a word representing S1 is mentioned in the glosses of S2 or vice versa,
414 return 1, otherwise return 0
416 for lemma in S2.lemmas:
417 for example in self.__glosses(S1):
418 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
419 return 1
420 for lemma in S1.lemmas:
421 for example in self.__glosses(S2):
422 if self.__is_a_subsequence(lemma.name.split('_'), example.split(' ')):
423 return 1
424 return 0
426 def __get_gloss_hyperonymy_meronymy_path_score(self, S1, S2):
428 if something in the gloss of S1 is above or below S2, return 1
429 otherwise 0
431 for example in self.__glosses(S1):
432 for word in example.split(' '):
433 for synset in wn.synsets(word):
434 if (self.__get_hypernymy_meronymy_path_score(synset, S2)
435 or self.__get_hyponymy_holonymy_path_score(synset, S2)):
436 return 1
437 return 0
439 def __get_gloss_parallelism_score(self, S1, S2):
441 if there is a common term above S2 and a word in the glosses of S1,
442 return 1, otherwise 0
444 for example in self.__glosses(S1):
445 for word in example.split(' '):
446 for synset in wn.synsets(word):
447 if self.__get_parallelism_score(synset, S2):
448 return 1
449 return 0
451 def __get_gloss_gloss_score(self, S1, S2):
453 If a word in the glosses of S1 shares a synset with a word in the glosses of S2,
454 return 1, otherwise 0.
456 synsets_lists1 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S1))))
457 synsets_lists2 = list(chain(*((wn.synsets(word) for word in example.split(' ')) for example in self.__glosses(S2))))
458 for synsets in synsets_lists1:
459 if synsets in synsets_lists2:
460 return 1
462 return 0
464 def __get_hyperonymy_meronyomy_gloss_path_score(self, S1, S2):
466 if something in the gloss of S2 is above or below S1, return 1
467 otherwise 0
469 return self.__get_gloss_hyperonymy_meronymy_path_score(S2, S1)
471 def __get_parallelism_gloss_score(self, S1, S2):
473 if there is a common term above S1 and a word in the glosses of S2,
474 return 1, otherwise 0
476 return self.__get_gloss_parallelism_score(S2, S1)
478 def __get_best_score_index(self, scores):
480 given a set of scores, return the index of the "best" one
482 scores are assumed to be lists of numbers
484 best_score_index = None
485 for i in xrange(len(scores)):
486 score = scores[i]
487 if score:
488 if best_score_index is None:
489 if len(filter(lambda x: x != 0, score)) != 0:
490 best_score_index = i
491 elif self.__is_better_score(score, scores[best_score_index]):
492 best_score_index = i
493 return best_score_index
495 def __is_better_score(self, score_a, score_b):
497 returns true if score_a is better than score_b
499 return self.__value(score_a) > self.__value(score_b)
501 def __value(self, score):
503 [Navigli and Velardi 2002] mention some sort of "lexicographic ordering" of scores
504 but don't specify exactly what they mean by this, and my attempts at recreating it
505 produced incredibly weird results. Thus, I've opted for something a little simpler
506 that seems to work well enough in practice.
508 the "value" of a score is considered to be 3* the number of non-0 subscores in it,
509 plus the sum total of the subscores.
510 thus, having two scores of "1" results in a total score of 8, which is equivalent
511 to a single score of "5"
513 # dictionary order gives weird results.
514 return sum(score) + 3*len(filter(lambda x: x != 0, score))