// Helper class with number-of-occurrences-in-corpus for a linguistic construct
2 // Copyright © 2009 The University of Chicago
#ifndef CORPUSCOUNT_H
#define CORPUSCOUNT_H

#include <iostream>

namespace linguistica
{
    class corpus_count;
}

/// Corpus count (token count) for a linguistic construct.
///
/// It is useful to remember how many times each morpheme, phoneme,
/// morphological signature, and part of speech occurs in the corpus:
///
///  * The information content of an instance of that construct is
///      -log2(corpus count of construct /
///            corpus count of genre of construct)
///    [So, for example, the encoding length of "hello" is
///      -log2(# of appearances of "hello" / total # of words in corpus)]
///
///  * When displaying results, it is convenient to sort by corpus count.
///
///  * If the corpus count is very low, we can skip some costly operations
///    on a construct and be reasonably sure we are not introducing too
///    much error.
///
/// This class maintains a corpus count for a linguistic construct and
/// provides methods for accessing and modifying it.
///
/// The count is a "token count" (total number of appearances), not
/// "use count" (number of distinct contexts in which this appears).
class linguistica::corpus_count
{
    /// Intended to be nonnegative; check_underflow() reports violations
    /// but does not correct them.
    int m_corpus_count;

    /// Reports a negative corpus count on std::cerr.
    ///
    /// Called after every operation that stores a new count.  The count
    /// itself is left untouched.
    ///
    /// NOTE(review): the listing this was recovered from carried a doc
    /// comment "thrown if corpus count drops below zero" for a declaration
    /// that is not visible; no exception is thrown by the visible code, so
    /// none is thrown here.  Confirm against the original header.
    void check_underflow()
    {
        // NOTE(review): this flag is only ever written in the visible
        // code, so the report below is unconditional.  The original may
        // have consulted it before reporting; confirm before relying on
        // either behaviour.
        static bool check_enabled = false;

        if (m_corpus_count < 0) {
            // XXX. This shouldn't happen.
            std::cerr << "corpus count underflow for object ";
            std::cerr << m_corpus_count << std::endl;
            check_enabled = false;
        }
    }

public:
    /// Starts the count at zero.
    corpus_count() : m_corpus_count(0) { }

    /// Starts the count at @p n.  Deliberately not explicit, so an int
    /// converts implicitly to a corpus_count.
    corpus_count(int n) : m_corpus_count(n)
    { check_underflow(); }

    // copy constructor, assignment operator defined implicitly.

    virtual ~corpus_count() { }

    /// Replaces the count with @p n; equivalent to SetCorpusCount(n).
    /// @return *this, to allow chained assignment.
    corpus_count& operator=(int n) { SetCorpusCount(n); return *this; }

    /// @return the current corpus count.
    int GetCorpusCount() const { return m_corpus_count; }

    /// Adds @p incr (which may be negative) to the count.
    void IncrementCorpusCount(int incr)
    { m_corpus_count += incr; check_underflow(); }

    /// Replaces the count with @p n.
    void SetCorpusCount(int n)
    { m_corpus_count = n; check_underflow(); }
};

#endif // CORPUSCOUNT_H