nltk_ontology_framework.git: src/mjacob/ontologybuilder/highlighted_terms.py
# This Python file uses the following encoding: utf-8
'''
Created on May 3, 2011

@author: mjacob
'''
from framework import OntologyBuilderTerms
from util.cache_util import get_cache, is_cached, set_cache
import math

class HighlightedTerms(OntologyBuilderTerms):
    """Extracts candidate terms from a domain corpus, optionally boosted by a
    parallel corpus of highlighted term sentences and weighted against one or
    more contrast corpora."""

    def __init__(self, corpus, highlights, other_corpora, tagger, chunker, chunk_filter, subterms, term_filter):
        self.__corpus = corpus                # domain corpus to extract terms from
        self.__highlights = highlights        # corpus of highlighted sentences, or None
        self.__other_corpora = other_corpora  # list of (corpus, name) contrast corpora
        self.__tagger = tagger                # POS tagger with a tag(tokens) method
        self.__chunker = chunker              # chunk parser with a parse(tagged) method
        self.__chunk_filter = chunk_filter    # selects the relevant chunks (e.g. NPs)
        self.__subterms = subterms            # expands a chunk into candidate subterms
        self.__term_filter = term_filter      # final filter over scored candidates

    def _get_terms(self, **state):
        """Input: one domain corpus and one or more general corpora.
        Identify term candidates in each (NPs that fit certain statistics),
        then discount candidates that also show up in the general corpora."""
        main_statistics = self.__get_statistics(self.__corpus, "main", self.__highlights)
        other_stats = [self.__get_statistics(other_corpus, other_corpus_name, None)
                       for other_corpus, other_corpus_name in self.__other_corpora]

        # score each candidate by how specific it is to the domain corpus and
        # by how it is distributed across the domain documents
        term_relevances = self.__get_term_relevances(main_statistics, other_stats)
        term_entropies = self.__get_term_entropies(main_statistics, len(self.__corpus.files()))

        terms = self.__term_filter(main_statistics, term_relevances, term_entropies)

        return terms

    def __get_term_entropies(self, statistics, doc_count):
        """Score each term by its contribution p * log2(1/p) to the entropy
        of the document-frequency distribution, where p is the fraction of
        documents the term occurs in."""
        cache_filename = "entropies"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            entropies = get_cache(self.cachedir(), self.name(), cache_filename)
            return entropies

        entropies = {}
        entropy_total = 0.0
        for term in statistics:
            # document frequency of the term over the total document count
            prob = (1.0 * len(statistics[term])) / doc_count
            entropy = prob * math.log(1 / prob, 2)
            entropy_total += entropy
            entropies[term] = entropy
        # normalize each score against the total entropy (scaled by 1000)
        for term in entropies:
            entropies[term] /= (entropy_total * .001)

        set_cache(self.cachedir(), self.name(), cache_filename, entropies)

        return entropies
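
    # A worked instance of the score above (illustrative numbers, not taken
    # from any corpus): a term appearing in 5 of 20 documents has
    # p = 5/20 = 0.25 and scores p * log2(1/p) = 0.25 * 2 = 0.5, while a
    # term appearing in every document has p = 1 and scores 0. Each raw
    # score is then divided by (entropy_total * 0.001), i.e. rescaled to
    # thousandths of the total.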

    def __symbol_count(self, statistics):
        """Total number of term occurrences recorded in a statistics map."""
        n = 0
        for term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return n

    def __count(self, term, statistics, size):
        """Total occurrences of a term in a statistics map (the size
        argument is accepted but not used)."""
        n = 0
        if term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return n

    def __get_term_relevances(self, main_statistics, other_stats):
        """Score each term by how specific it is to the domain corpus
        relative to the contrast corpora."""
        cache_filename = "relevances"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_relevances = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_relevances

        term_relevances = {}

        main_size = self.__symbol_count(main_statistics)
        other_sizes = [self.__symbol_count(stat) for stat in other_stats]

        for term in main_statistics:
            numerator = self.__count(term, main_statistics, main_size)
            denominator = sum(self.__count(term, other_stats[i], other_sizes[i]) for i in xrange(len(other_stats)))
            term_relevances[term] = (1.0 * numerator) / (numerator + denominator)

        set_cache(self.cachedir(), self.name(), cache_filename, term_relevances)

        return term_relevances
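
    # The relevance score above works out to
    #     count_main(t) / (count_main(t) + sum_i count_other_i(t)).
    # Illustrative numbers: a term counted 30 times in the domain corpus and
    # 10 times across all contrast corpora scores 30 / (30 + 10) = 0.75,
    # and a term absent from every contrast corpus scores exactly 1.0.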

    def __get_statistics(self, corpus, corpus_name, highlights):
        """Count, per file, how often each candidate term occurs in the
        given corpus; returns {term: {file: count}}."""
        print "finding terms in %s" % (corpus_name)

        cache_filename = "%s.statistics" % (corpus_name)
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_statistics = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_statistics

        term_statistics = {}
        for file in corpus.files():
            print "processing %s" % (file)
            termcounts_in_file = {}
            sentences = corpus.sents(file)
            for sentence in sentences:
                # tag and chunk the sentence, keep only the relevant chunks
                # (e.g. noun phrases), and count every candidate subterm
                tagged = self.__tagger.tag(sentence)
                chunked = self.__chunker.parse(tagged)
                relevant_chunks = self.__chunk_filter(chunked)
                for chunk in relevant_chunks:
                    for subterm in self.__subterms(chunk):
                        if subterm in termcounts_in_file:
                            termcounts_in_file[subterm] += 1
                        else:
                            termcounts_in_file[subterm] = 1
            if highlights:
                # highlighted sentences are lists of known terms; count their
                # subterms as additional occurrences
                sentence_highlights = highlights.sents(file)
                for term_sentence in sentence_highlights:
                    for term in term_sentence:
                        for subterm in self.__subterms(term):
                            if subterm in termcounts_in_file:
                                termcounts_in_file[subterm] += 1
                            else:
                                termcounts_in_file[subterm] = 1

            # now, accumulate the per-file counts into the overall statistics
            for term, count in termcounts_in_file.items():
                if term in term_statistics:
                    term_statistics[term][file] = count
                else:
                    term_statistics[term] = {file: count}
            print "processed %s" % (file)

        set_cache(self.cachedir(), self.name(), cache_filename, term_statistics)
        return term_statistics
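
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; every object below is a hypothetical
# stand-in, not something provided by this module):
#
#     corpus = ...          # must expose files() and sents(file)
#     highlights = ...      # corpus of highlighted sentences, or None
#     general = ...         # a contrast corpus with the same interface
#     tagger = ...          # POS tagger with a tag(tokens) method, e.g. from nltk
#     chunker = ...         # chunk parser with a parse(tagged) method
#
#     builder = HighlightedTerms(corpus, highlights, [(general, "general")],
#                                tagger, chunker,
#                                chunk_filter, subterms, term_filter)
#     terms = builder._get_terms()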