nltk_ontology_framework.git: src/mjacob/ontologybuilder/highlighted_terms.py
# This Python file uses the following encoding: utf-8
'''
Created on May 3, 2011

@author: mjacob
'''
from framework import OntologyBuilderTerms
from util.cache_util import get_cache, is_cached, set_cache
import math

class HighlightedTerms(OntologyBuilderTerms):
    """Extracts candidate terms from a domain corpus, optionally boosted by a
    parallel corpus of highlighted term sentences and weighted against one or
    more contrast corpora."""

    def __init__(self, corpus, highlights, other_corpora, tagger, chunker, chunk_filter, subterms, term_filter):
        self.__corpus = corpus                # domain corpus to extract terms from
        self.__highlights = highlights        # corpus of highlighted sentences, or None
        self.__other_corpora = other_corpora  # list of (corpus, name) contrast corpora
        self.__tagger = tagger                # POS tagger with a tag(tokens) method
        self.__chunker = chunker              # chunk parser with a parse(tagged) method
        self.__chunk_filter = chunk_filter    # selects the relevant chunks (e.g. NPs)
        self.__subterms = subterms            # expands a chunk into candidate subterms
        self.__term_filter = term_filter      # final filter over scored candidates

    def _get_terms(self, **state):
        """Input: one domain corpus and one or more general corpora.
        Identify term candidates in each (NPs that fit certain statistics),
        then discount candidates that also show up in the general corpora."""
        main_statistics = self.__get_statistics(self.__corpus, "main", self.__highlights)
        other_stats = [self.__get_statistics(other_corpus, other_corpus_name, None)
                       for other_corpus, other_corpus_name in self.__other_corpora]

        # score each candidate by how specific it is to the domain corpus and
        # by how it is distributed across the domain documents
        term_relevances = self.__get_term_relevances(main_statistics, other_stats)
        term_entropies = self.__get_term_entropies(main_statistics, len(self.__corpus.files()))

        terms = self.__term_filter(main_statistics, term_relevances, term_entropies)

        return terms

    def __get_term_entropies(self, statistics, doc_count):
        """Score each term by its contribution p * log2(1/p) to the entropy
        of the document-frequency distribution, where p is the fraction of
        documents the term occurs in."""
        cache_filename = "entropies"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            entropies = get_cache(self.cachedir(), self.name(), cache_filename)
            return entropies

        entropies = {}
        entropy_total = 0.0
        for term in statistics:
            # document frequency of the term over the total document count
            prob = (1.0 * len(statistics[term])) / doc_count
            entropy = prob * math.log(1 / prob, 2)
            entropy_total += entropy
            entropies[term] = entropy
        # normalize each score against the total entropy (scaled by 1000)
        for term in entropies:
            entropies[term] /= (entropy_total * .001)

        set_cache(self.cachedir(), self.name(), cache_filename, entropies)

        return entropies
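
    # A worked instance of the score above (illustrative numbers, not taken
    # from any corpus): a term appearing in 5 of 20 documents has
    # p = 5/20 = 0.25 and scores p * log2(1/p) = 0.25 * 2 = 0.5, while a
    # term appearing in every document has p = 1 and scores 0. Each raw
    # score is then divided by (entropy_total * 0.001), i.e. rescaled to
    # thousandths of the total.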

    def __symbol_count(self, statistics):
        """Total number of term occurrences recorded in a statistics map."""
        n = 0
        for term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return n

    def __count(self, term, statistics, size):
        """Total occurrences of a term in a statistics map (the size
        argument is accepted but not used)."""
        n = 0
        if term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return n

    def __get_term_relevances(self, main_statistics, other_stats):
        """Score each term by how specific it is to the domain corpus
        relative to the contrast corpora."""
        cache_filename = "relevances"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_relevances = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_relevances

        term_relevances = {}

        main_size = self.__symbol_count(main_statistics)
        other_sizes = [self.__symbol_count(stat) for stat in other_stats]

        for term in main_statistics:
            numerator = self.__count(term, main_statistics, main_size)
            denominator = sum(self.__count(term, other_stats[i], other_sizes[i]) for i in xrange(len(other_stats)))
            term_relevances[term] = (1.0 * numerator) / (numerator + denominator)

        set_cache(self.cachedir(), self.name(), cache_filename, term_relevances)

        return term_relevances
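
    # The relevance score above works out to
    #     count_main(t) / (count_main(t) + sum_i count_other_i(t)).
    # Illustrative numbers: a term counted 30 times in the domain corpus and
    # 10 times across all contrast corpora scores 30 / (30 + 10) = 0.75,
    # and a term absent from every contrast corpus scores exactly 1.0.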

    def __get_statistics(self, corpus, corpus_name, highlights):
        """Count, per file, how often each candidate term occurs in the
        given corpus; returns {term: {file: count}}."""
        print "finding terms in %s" % (corpus_name)

        cache_filename = "%s.statistics" % (corpus_name)
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_statistics = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_statistics

        term_statistics = {}
        for file in corpus.files():
            print "processing %s" % (file)
            termcounts_in_file = {}
            sentences = corpus.sents(file)
            for sentence in sentences:
                # tag and chunk the sentence, keep only the relevant chunks
                # (e.g. noun phrases), and count every candidate subterm
                tagged = self.__tagger.tag(sentence)
                chunked = self.__chunker.parse(tagged)
                relevant_chunks = self.__chunk_filter(chunked)
                for chunk in relevant_chunks:
                    for subterm in self.__subterms(chunk):
                        if subterm in termcounts_in_file:
                            termcounts_in_file[subterm] += 1
                        else:
                            termcounts_in_file[subterm] = 1
            if highlights:
                # highlighted sentences are lists of known terms; count their
                # subterms as additional occurrences
                sentence_highlights = highlights.sents(file)
                for term_sentence in sentence_highlights:
                    for term in term_sentence:
                        for subterm in self.__subterms(term):
                            if subterm in termcounts_in_file:
                                termcounts_in_file[subterm] += 1
                            else:
                                termcounts_in_file[subterm] = 1

            # now, accumulate the per-file counts into the overall statistics
            for term, count in termcounts_in_file.items():
                if term in term_statistics:
                    term_statistics[term][file] = count
                else:
                    term_statistics[term] = {file: count}
            print "processed %s" % (file)

        set_cache(self.cachedir(), self.name(), cache_filename, term_statistics)
        return term_statistics
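
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; every object below is a hypothetical
# stand-in, not something provided by this module):
#
#     corpus = ...          # must expose files() and sents(file)
#     highlights = ...      # corpus of highlighted sentences, or None
#     general = ...         # a contrast corpus with the same interface
#     tagger = ...          # POS tagger with a tag(tokens) method, e.g. from nltk
#     chunker = ...         # chunk parser with a parse(tagged) method
#
#     builder = HighlightedTerms(corpus, highlights, [(general, "general")],
#                                tagger, chunker,
#                                chunk_filter, subterms, term_filter)
#     terms = builder._get_terms()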