# This Python file uses the following encoding: utf-8
import math

from framework import OntologyBuilderTerms
from util.cache_util import get_cache, is_cached, set_cache


class HighlightedTerms(OntologyBuilderTerms):

    def __init__(self, corpus, highlights, other_corpora, tagger,
                 chunker, chunk_filter, subterms, term_filter):
        self.__corpus = corpus
        self.__highlights = highlights
        self.__other_corpora = other_corpora
        self.__tagger = tagger
        self.__chunker = chunker
        self.__chunk_filter = chunk_filter
        self.__subterms = subterms
        self.__term_filter = term_filter
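
    # Collaborator interfaces, as used below: tagger.tag(sentence) tags a
    # tokenized sentence, chunker.parse(tagged) chunks it,
    # chunk_filter(chunked) selects the chunks of interest (NPs),
    # subterms(chunk) expands a chunk into candidate subterms, and
    # term_filter(statistics, relevences, entropies) picks the final terms.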

    def _get_terms(self, **state):
        """input: 1 domain corpus, 1 general corpus
        identify term candidates in each (NPs that fit some statistics)
        remove terms identified in general corpus from domain corpus"""
        main_statistics = self.__get_statistics(self.__corpus, "main", self.__highlights)
        other_stats = [self.__get_statistics(other_corpus, other_corpus_name, None)
                       for other_corpus, other_corpus_name in self.__other_corpora]

        term_relevences = self.__get_term_relevences(main_statistics, other_stats)
        term_entropies = self.__get_term_entropies(main_statistics,
                                                   len(self.__corpus.files()))

        # hand the filtered candidate terms back to the caller
        terms = self.__term_filter(main_statistics, term_relevences, term_entropies)
        return terms

    def __get_term_entropies(self, statistics, doc_count):
        cache_filename = "entropies"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            entropies = get_cache(self.cachedir(), self.name(), cache_filename)
            return entropies

        entropies = {}
        entropy_total = 0.0
        for term in statistics:
            # fraction of documents the term occurs in
            prob = (1.0 * len(statistics[term])) / doc_count
            entropy = prob * math.log(1 / prob, 2)
            entropy_total += entropy
            entropies[term] = entropy
        # rescale each entropy by 1000 / entropy_total
        for term in entropies:
            entropies[term] /= (entropy_total * .001)

        set_cache(self.cachedir(), self.name(), cache_filename, entropies)
        return entropies

    def __symbol_count(self, statistics):
        # total number of term occurrences recorded across a corpus
        n = 0
        for term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return n

    def __count(self, term, statistics, size):
        # occurrences of term in the corpus; normalizing by the corpus size
        # (an assumption consistent with how size is passed in) keeps counts
        # from corpora of different sizes comparable
        n = 0
        if term in statistics:
            for file in statistics[term]:
                n += statistics[term][file]
        return (1.0 * n) / size

    def __get_term_relevences(self, main_statistics, other_stats):
        cache_filename = "relevences"
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_relevences = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_relevences

        term_relevences = {}
        main_size = self.__symbol_count(main_statistics)
        other_sizes = [self.__symbol_count(stat) for stat in other_stats]

        for term in main_statistics:
            numerator = self.__count(term, main_statistics, main_size)
            denominator = sum(self.__count(term, other_stats[i], other_sizes[i])
                              for i in xrange(len(other_stats)))
            term_relevences[term] = (1.0 * numerator) / (numerator + denominator)

        set_cache(self.cachedir(), self.name(), cache_filename, term_relevences)
        return term_relevences
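
    # Worked example for the relevance score above (illustrative numbers,
    # assuming __count returns a size-normalized frequency): a term with
    # relative frequency 0.004 in the domain corpus and 0.001 summed over the
    # other corpora scores 0.004 / (0.004 + 0.001) = 0.8. Domain-specific
    # terms approach 1.0; terms equally frequent everywhere sit near 0.5.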

    def __get_statistics(self, corpus, corpus_name, highlights):
        print "finding terms in %s" % (corpus_name)

        cache_filename = "%s.statistics" % (corpus_name)
        if is_cached(self.cachedir(), self.name(), cache_filename):
            term_statistics = get_cache(self.cachedir(), self.name(), cache_filename)
            return term_statistics

        term_statistics = {}
        for file in corpus.files():
            print "processing %s" % (file)
            termcounts_in_file = {}
            sentences = corpus.sents(file)
            for sentence in sentences:
                tagged = self.__tagger.tag(sentence)
                chunked = self.__chunker.parse(tagged)
                relevent_chunks = self.__chunk_filter(chunked)
                for chunk in relevent_chunks:
                    for subterm in self.__subterms(chunk):
                        if subterm in termcounts_in_file:
                            termcounts_in_file[subterm] += 1
                        else:
                            termcounts_in_file[subterm] = 1

            # highlights are only provided for the main corpus; the other
            # corpora pass None, so guard before counting highlighted terms
            if highlights is not None:
                sentence_highlights = highlights.sents(file)
                for term_sentence in sentence_highlights:
                    for term in term_sentence:
                        for subterm in self.__subterms(term):
                            if subterm in termcounts_in_file:
                                termcounts_in_file[subterm] += 1
                            else:
                                termcounts_in_file[subterm] = 1

            # now, accumulate all the statistics
            for term, count in termcounts_in_file.items():
                if term in term_statistics:
                    term_statistics[term][file] = count
                else:
                    term_statistics[term] = {file: count}
            print "processed %s" % (file)

        set_cache(self.cachedir(), self.name(), cache_filename, term_statistics)
        return term_statistics
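

# A minimal usage sketch (hypothetical objects: the corpus and highlights
# arguments are assumed to expose files() and sents(file), the other corpora
# are (corpus, name) pairs, and the tagger/chunker follow the tag()/parse()
# interface used above; the framework presumably drives _get_terms itself):
#
#     builder = HighlightedTerms(domain_corpus, domain_highlights,
#                                [(general_corpus, "general")],
#                                tagger, chunker, chunk_filter,
#                                subterms, term_filter)
#     terms = builder._get_terms()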