# This Python file uses the following encoding: utf-8
import re
from itertools import chain

import nltk.corpus

from framework import OntologyBuilderFramework, OntologyBuilderSynonyms
from util.cached import Cached

wn = nltk.corpus.wordnet
wn.synsets  # this is to get rid of stupid errors in my IDE due to nltk's lazy loading stuff

POSSESIVE = re.compile(r"(?:'s|')")  # matches possessive markers: 's or a bare apostrophe
DETERMINERS = set(('the', 'The', 'a', 'as', 'A', 'that', 'That', 'this', 'This'))
PREPOSITIONS = set(('to', 'To', 'in', 'In', 'on', 'On'))

EMPTY_SYNSET = frozenset()  # a word with no WordNet synsets yields this empty frozenset

class SimpleWordnetSynonyms(OntologyBuilderSynonyms):
    """
    Constructs synonyms from terms.

    Two terms are deemed synonyms if the synsets for their component words
    are identical.
    """

    @Cached(lambda *x, **y: OntologyBuilderFramework.SYNONYMS)
    def _get_synonyms(self, **state):
        """
        Constructs a dictionary mapping each synonym key to a list of terms,
        where a synonym key is defined as a list of WordNet synsets of the
        words in the term.

        As an ad-hoc decision, it strips out some determiners and flips
        everything around instances of the word "of".
        """
        terms = state[OntologyBuilderFramework.TERMS]

        synonyms = {}
        for term in terms:
            # Tokenise the term (assumed here to be a whitespace-separated
            # string), drop determiners, and strip possessive markers.
            subterms = map(lambda x: POSSESIVE.sub("", x),
                           filter(lambda x: x not in DETERMINERS,
                                  term.split()))
            # One frozenset of synset names per word, restricted to noun and
            # adjective senses.
            synset_list = tuple(frozenset(synset.name
                                          for synset in wn.synsets(subterm, pos=wn.NOUN + wn.ADJ))
                                for subterm in subterms)
            # Segment the term around occurrences of "of" and reverse the
            # segments, so that e.g. "X of Y" and "Y X" share a key.
            indices = self.__get_indices('of', subterms)
            phrases = self.__phrasify(subterms, indices)
            synset_list = tuple(chain(*(synset_list[a[0]:a[1]] for a in reversed(phrases))))
            if EMPTY_SYNSET in synset_list:
                continue  # just ignore anything w/out a synset

            if synset_list in synonyms:
                synonyms[synset_list].append(term)
            else:
                synonyms[synset_list] = [term]

        return synonyms
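
    # Illustrative sketch of the keying scheme above; the example terms are
    # invented, not taken from the original data:
    #
    #   "the rate of interest" -> subterms ['rate', 'of', 'interest']
    #                          -> phrases [(0, 1), (2, 3)], reversed and sliced
    #                          -> key (synsets('interest'), synsets('rate'))
    #   "interest rate"        -> subterms ['interest', 'rate']
    #                          -> phrases [(0, 2)]
    #                          -> key (synsets('interest'), synsets('rate'))
    #
    # Both terms land under the same key, so they are grouped as synonyms.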
    def __get_indices(self, x, ys):
        """Return the indices in ys at which x occurs."""
        return filter(lambda i: ys[i] == x, xrange(len(ys)))
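
    # For example (a sketch; assumes Python 2, where filter returns a list):
    #   __get_indices('of', ['rate', 'of', 'interest'])  ->  [1]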
    def __phrasify(self, subterms, indices):
        """
        Assuming that "indices" marks the positions of function words in
        subterms, returns a list of (start, stop) spans covering the
        subphrases that contain actual lexical items.
        """
        phrases = []
        start = 0
        for index in indices:
            stop = index
            phrases.append((start, stop))
            start = index + 1
        phrases.append((start, len(subterms)))
        return phrases
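
    # For example (a sketch built on the span-splitting above; the inputs are
    # invented for illustration):
    #   __phrasify(['rate', 'of', 'interest'], [1])  ->  [(0, 1), (2, 3)]
    # _get_synonyms reverses these spans and uses them to slice the per-word
    # synset tuple, dropping the function word itself.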