1 // Implementation of core CMiniLexicon methods
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
6 #include <Q3TextStream>
9 #include "GUIclasses.h"
11 #include "DescriptionLength.h"
14 #include "SignatureCollection.h"
15 #include "PrefixCollection.h"
16 #include "SuffixCollection.h"
17 #include "WordCollection.h"
18 #include "StemCollection.h"
19 #include "POSCollection.h"
20 #include "AffixLocation.h"
// Constructor. Stores the owning lexicon and the affix location, then
// allocates the word, affix, stem, signature and GUI word collections.
// The signature collection is built over m_pPrefixes when
// is_initial(affixLocation) is true and over m_pSuffixes otherwise.
// NOTE(review): several initializer lines are not visible in this excerpt
// (presumably the m_Index initializer and the alternate arms of the
// m_pSuffixes / m_pPrefixes conditionals), nor is the end of the initializer
// list or the constructor body -- confirm against the full file.
// NOTE(review): the m_pSignatures initializer reads m_pPrefixes/m_pSuffixes,
// so it relies on those members being declared earlier in the class -- verify.
26 CMiniLexicon::CMiniLexicon(CLexicon
* lexicon
, int index
,
27 enum eAffixLocation affixLocation
)
28 : m_pLexicon(lexicon
),
30 m_AffixLocation(affixLocation
),
31 m_pWords(new CWordCollection(this)),
32 m_pSuffixes(is_initial(affixLocation
) ?
34 new CSuffixCollection(this)),
35 m_pPrefixes(is_initial(affixLocation
) ?
36 new CPrefixCollection(this) :
38 m_pStems(new CStemCollection(this)),
39 m_pSignatures(is_initial(affixLocation
) ?
40 new CSignatureCollection(this, m_pPrefixes
, affixLocation
) :
41 new CSignatureCollection(this, m_pSuffixes
, affixLocation
)),
// Value-initialized; the destructor later passes this member to `delete`.
43 m_DescriptionLength(),
44 m_CorpusCountOfUnanalyzedWords(0.0),
45 m_PhonologicalInformationOfUnanalyzedWords(0.0),
46 m_GUIWords(new GUIWordCollection(this, m_pWords
)),
// Destructor. For each word in m_pWords it calls SimplifyParseStructure()
// on the word and passes it to m_pLexicon->UpdateWord(); it also calls
// m_pLexicon->DoWordUpdates() and deletes m_DescriptionLength.
// NOTE(review): braces and the lines between DoWordUpdates() and the final
// delete are not visible in this excerpt; other members may be released
// there -- confirm against the full file.
51 CMiniLexicon::~CMiniLexicon()
53 // Update corpus words when deleting mini-lexicon
54 for (int i
= 0; i
< m_pWords
->GetCount(); ++i
) {
55 CStem
* word
= m_pWords
->GetAt(i
);
57 word
->SimplifyParseStructure();
58 m_pLexicon
->UpdateWord(word
);
// Presumably executed once, after the loop above has closed -- confirm.
60 m_pLexicon
->DoWordUpdates();
68 delete m_DescriptionLength
;
73 void CMiniLexicon::AddToScreen( QString msg
)
75 m_pLexicon
->AddToScreen( msg
);
// Accessor returning a GUIWordCollection pointer.
// NOTE(review): the function body is not visible in this excerpt; it
// presumably returns m_GUIWords -- confirm against the full file.
78 GUIWordCollection
* CMiniLexicon::GetGUIWords()
83 int CMiniLexicon::GetCorpusCount()
85 return m_pLexicon
->GetCorpusCount();
89 int CMiniLexicon::GetNumberOfCharacterTypes()
91 return m_pLexicon
->GetNumberOfCharacterTypes();
95 CDLHistory
* CMiniLexicon::GetDLHistory()
97 return m_pLexicon
->GetDLHistory();
100 CStem
* CMiniLexicon::GetWordFromStemSuffix(CStem
* pStem
, CSuffix
* pSuffix
)
102 if (pSuffix
->Display() == TheStringNULL
)
104 return *m_pWords
^= pStem
->Display();
108 return *m_pWords
^= pStem
->Display() + pSuffix
->Display();
111 CStem
* CMiniLexicon::GetWordFromStemPrefix(CStem
* pStem
, CPrefix
* pPrefix
)
113 if (pPrefix
->Display() == TheStringNULL
)
115 return *m_pWords
^= pStem
->Display();
119 return *m_pWords
^= pPrefix
->Display() + pStem
->Display();
// Copies words from another word collection into m_pWords.
//
// pWords - source collection, walked by index with GetAt().
// subset - selects which source words are copied (see the case labels below).
//
// Each copied word is inserted with `(*m_pWords) << pWord` and then given
// the source word's word type.
// NOTE(review): the declaration of `w`, the switch header, the label guarding
// the first (unconditional) copy, and any `break` statements are not visible
// in this excerpt -- confirm against the full file. `css` is not used in the
// visible lines.
122 void CMiniLexicon::AddToWordCollection(CWordCollection
* pWords
,
123 enum which_words subset
)
125 CStem
* pWord
, * qWord
;
127 CStringSurrogate css
;
129 for( w
= 0; w
< (int) pWords
->GetCount(); w
++ )
131 pWord
= pWords
->GetAt(w
);
// Unconditional copy of the current word.
135 qWord
= (*m_pWords
) << pWord
;
136 qWord
->SetWordType( pWord
->GetWordType() );
// Analyzed words only: copy when pWord->Size() is greater than 1.
138 case WW_AnalyzedOnly
:
139 if( pWord
->Size() > 1 )
141 qWord
= (*m_pWords
) << pWord
;
142 qWord
->SetWordType( pWord
->GetWordType() );
// Unanalyzed words only: copy when pWord->Size() is at most 1.
145 case WW_UnanalyzedOnly
:
147 if( pWord
->Size() <= 1 )
149 qWord
= (*m_pWords
) << pWord
;
150 qWord
->SetWordType( pWord
->GetWordType() );
158 void CMiniLexicon::AddToWordCollection( CStemCollection
* pWords
)
160 for (int w
= 0; w
< pWords
->GetCount(); ++w
) {
161 CStem
* pWord
= pWords
->GetAt(w
);
162 CStem
* qWord
= (*m_pWords
) << pWord
;
164 const bool compound
=
165 pWord
->GetStemType() == CStem::BIWORD_COMPOUND
||
166 pWord
->GetStemType() == CStem::MULTIPLE_COMPOUND
;
167 qWord
->SetWordType(compound
?
168 CStem::STEM_COMPOUND
: CStem::STEM_NORMAL
);
173 void CMiniLexicon::ClearAll()
175 if( m_pStems
) m_pStems
->Empty();
176 if( m_pWords
) m_pWords
->Empty();
178 if( m_pSuffixes
) m_pSuffixes
->Empty();
179 if( m_pPrefixes
) m_pPrefixes
->Empty();
181 if( m_pSignatures
) m_pSignatures
->Empty();
185 CCorpusWord
* CMiniLexicon::FindAWord(CStem
* pStem
, CSuffix
* pSuffix
)
187 return m_pLexicon
->FindAWord(pStem
, pSuffix
);
// Runs the suffix-discovery sequence for this mini-lexicon.
//
// Visible steps: build a "Mini-Lexicon <m_Index>" name, check that the affix
// location is STEM_FINAL or WORD_FINAL, call SuccessorFreq1 on the words, run
// the stem/affix/signature helper passes, set the word display mode,
// recalculate the description length, install a new FSA on the owning
// lexicon, and print the elapsed time to std::cout.
// NOTE(review): braces, the declaration of the timer `t`, the return
// statement(s) and several intermediate lines are not visible in this
// excerpt -- confirm scope and return value against the full file.
191 CSuffixCollection
* CMiniLexicon::FindSuffixes() //Suffixes/Run all
196 QString
mini_name( "Mini-Lexicon %1" );
197 mini_name
= mini_name
.arg( m_Index
);
// Discovery only applies to suffixing affix locations.
201 if( m_AffixLocation
== STEM_FINAL
|| m_AffixLocation
== WORD_FINAL
)
203 m_pWords
->SuccessorFreq1(GetStems(),
204 GetSuffixes(), GetSignatures(), SF1
,
205 CStem::NUMBER
| CStem::UNKNOWN
);
// NOTE(review): TakeSignaturesFindStems() is called twice in a row here --
// confirm that this is intentional.
211 ExtendKnownStemsToKnownAffixes();
212 TakeSignaturesFindStems();
213 TakeSignaturesFindStems();
214 ExtendKnownStemsToKnownAffixes();
215 FromStemsFindAffixes(); // problem here @@@ oct 2008 jg
221 FindSingletonSignatures(); //problem here jan 2010
225 FindMajorSignatures();
226 m_pWords
->m_DisplayMode
= CWordListViewItem::MiniLexicon_MorphologyStuffFirst
;
227 CalculateDescriptionLength();
// Hands the owning lexicon a freshly allocated FSA built from this object.
232 this->m_pLexicon
->SetFSA(new FSA(this));
236 std::cout
<< "Find Suffixes: Time elapsed: " <<
237 t
.elapsed() << "ms." << std::endl
;
// Runs the prefix-discovery sequence for this mini-lexicon.
//
// Visible steps: check that the affix location is STEM_INITIAL or
// WORD_INITIAL, build the reverse trie on the words, call PredecessorFreq1,
// run the stem/affix/signature helper passes, print the elapsed time to
// std::cout, and, when the owning lexicon has an FSA, add this
// mini-lexicon's prefixes to it.
// NOTE(review): braces, the declaration of the timer `t`, the return
// statement(s) and several intermediate lines are not visible in this
// excerpt -- confirm scope and return value against the full file.
243 CPrefixCollection
* CMiniLexicon::FindPrefixes()
// Discovery only applies to prefixing affix locations.
245 if( m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
250 m_pWords
->CreateReverseTrie();
252 m_pWords
->PredecessorFreq1(GetStems(),
253 GetPrefixes(), GetSignatures(), SF1
,
254 CStem::NUMBER
| CStem::UNKNOWN
);
258 ExtendKnownStemsToKnownAffixes();
260 TakeSignaturesFindStems();
262 ExtendKnownStemsToKnownAffixes();
264 FromStemsFindAffixes();
266 ExtendKnownStemsToKnownAffixes();
272 FindSingletonSignatures();
274 std::cout
<< "Find Prefixes: Time elapsed: " <<
275 t
.elapsed() << "ms." << std::endl
;
// Only touch the FSA when the owning lexicon already has one.
277 if(this->m_pLexicon
->GetFSA())
278 this->m_pLexicon
->GetFSA()->AddPrefixes(this);
286 LinguisticaMainWindow
* CMiniLexicon::GetDocument()
288 return m_pLexicon
->GetDocument();
291 int CMiniLexicon::GetIntParameter( QString strParam
, int iDefault
)
293 return m_pLexicon
->GetIntParameter( strParam
, iDefault
);
297 QTextStream
* CMiniLexicon::GetLogFile()
299 return m_pLexicon
->GetLogFileStream();
303 int CMiniLexicon::GetMiniCount()
305 return m_pLexicon
->GetMiniCount();
309 int CMiniLexicon::GetMiniSize()
311 return m_pLexicon
->GetMiniSize();
315 CMiniLexicon
* CMiniLexicon::GetMiniLexicon( int index
)
317 return m_pLexicon
->GetMiniLexicon( index
);
321 StringToString
* CMiniLexicon::GetOutFilter()
323 return m_pLexicon
->GetOutFilter();
327 bool CMiniLexicon::LogFileOn()
329 return (CLexicon
*)m_pLexicon
->LogFileOn();
// Changes the affix location of this mini-lexicon.
//
// affixLoc - the requested affix location.
//
// Visible behavior: nothing to do when the location is unchanged; when the
// old and new locations agree on is_initial(), only m_AffixLocation is
// updated; otherwise a new prefix or suffix collection is allocated (after
// asserting that the pointer is currently null), m_AffixLocation is updated
// and m_pSignatures is deleted and rebuilt over the matching affix
// collection.
// NOTE(review): the return statements, the action taken when m_pStems is
// non-empty, the `else` branch header and any deletion of the old affix
// collection are not visible in this excerpt -- confirm against the full
// file.
333 bool CMiniLexicon::SetAffixLocation(enum eAffixLocation affixLoc
)
335 if (m_AffixLocation
== affixLoc
)
336 // done, without lifting a finger!
// Same affix side (both initial or both final): just record the new value.
339 if (is_initial(m_AffixLocation
) == is_initial(affixLoc
)) {
340 m_AffixLocation
= affixLoc
;
344 // Affix types differ: throw away discoveries.
346 if (m_pStems
->GetCount() != 0)
347 // someone else could be using our discovered stems
350 if (is_initial(affixLoc
)) {
351 Q_ASSERT(m_pPrefixes
== 0);
352 m_pPrefixes
= new CPrefixCollection(this);
360 Q_ASSERT(m_pSuffixes
== 0);
361 m_pSuffixes
= new CSuffixCollection(this);
364 m_AffixLocation
= affixLoc
;
// Rebuild the signature collection over the affix collection now in use.
366 delete m_pSignatures
;
367 m_pSignatures
= is_initial(affixLoc
) ?
368 new CSignatureCollection(this, m_pPrefixes
, affixLoc
) :
369 new CSignatureCollection(this, m_pSuffixes
, affixLoc
);
// Recomputes and returns the total corpus count of unanalyzed words.
//
// The member m_CorpusCountOfUnanalyzedWords is reset to 0 and then
// accumulates GetCorpusCount() of every word in m_pWords whose suffix
// signature (for WORD_FINAL / STEM_FINAL) or prefix signature (for
// WORD_INITIAL / STEM_INITIAL) is null. The accumulated value is returned
// cast to int.
// NOTE(review): the declaration of `i` and the braces are not visible in
// this excerpt -- confirm against the full file.
373 int CMiniLexicon::GetCorpusCountOfUnanalyzedWords ( )
378 m_CorpusCountOfUnanalyzedWords
= 0;
// Suffixing mini-lexicon: a word without a suffix signature is unanalyzed.
380 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
382 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
384 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixSignature() )
386 m_CorpusCountOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->GetCorpusCount();
// Prefixing mini-lexicon: a word without a prefix signature is unanalyzed.
390 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
393 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
395 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixSignature() )
397 m_CorpusCountOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->GetCorpusCount();
402 return (int) m_CorpusCountOfUnanalyzedWords
;
// Sums base-2 log "pointer" costs over the unanalyzed words of m_pWords.
//
// MDLflag - MDL style selector.
//
// Two variants are visible:
//  * denominator = corpus count of unanalyzed words + stem corpus count;
//    each unanalyzed word adds base2log(denominator / its corpus count).
//  * denominator = number of unanalyzed words + total stem use count;
//    each unanalyzed word adds base2log(denominator).
// A word counts as unanalyzed when GetSuffixLoc() (final affix locations) or
// GetPrefixLoc() (initial affix locations) is 0.
// NOTE(review): the declarations of `total` and `i`, the condition choosing
// between the two variants (presumably a test on MDLflag), the braces and
// the return statement are not visible in this excerpt -- confirm against
// the full file.
// NOTE(review): no visible guard against a word whose corpus count is 0 in
// the divisions below -- confirm that this cannot occur.
404 double CMiniLexicon::CalculateSumOfPointersToMyUnanalyzedWords ( eMDL_STYLE MDLflag
)
408 double denominator
= 0;
// Variant 1: corpus-count based denominator.
415 denominator
= GetCorpusCountOfUnanalyzedWords ( ) +
416 m_pStems
->GetCorpusCount();
419 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
421 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
423 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
425 total
+= base2log ( denominator
/ (double) m_pWords
->GetAt(i
)->GetCorpusCount() ) ;
429 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
432 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
434 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
436 total
+= base2log ( denominator
/ m_pWords
->GetAt(i
)->GetCorpusCount() ) ;
// Variant 2: type-count based denominator. GetNumberOfAnalyzedWords() fills
// NumberOfUnanalyzedWords through its reference parameter.
443 int NumberOfUnanalyzedWords
;
444 GetNumberOfAnalyzedWords(NumberOfUnanalyzedWords
);
446 denominator
= NumberOfUnanalyzedWords
+
447 m_pStems
->GetTotalUseCount ( );
449 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
451 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
453 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
455 total
+= base2log ( denominator
) ;
459 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
462 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
464 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
466 total
+= base2log ( denominator
) ;
// Returns the summed phonological information content of unanalyzed words.
//
// The sum is stored in m_PhonologicalInformationOfUnanalyzedWords and is
// only computed while that member is 0; otherwise the stored value is
// returned as is. A word counts as unanalyzed when GetSuffixLoc() (final
// affix locations) or GetPrefixLoc() (initial affix locations) is 0; each
// such word contributes CalculatePhonologicalInformationContent(lexicon).
// NOTE(review): the declaration of `i` and the braces are not visible in
// this excerpt -- confirm against the full file.
// NOTE(review): a genuine total of 0 is indistinguishable from "not yet
// computed", so the loops would rerun on every call in that case.
478 double CMiniLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent( )
480 CLexicon
* MotherLexicon
= GetLexicon();
482 if ( m_PhonologicalInformationOfUnanalyzedWords
== 0)
484 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
486 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
488 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
490 m_PhonologicalInformationOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
);
494 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
496 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
498 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
500 m_PhonologicalInformationOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
);
507 return m_PhonologicalInformationOfUnanalyzedWords
;
// Returns the compressed length of the unanalyzed words.
//
// For every word without a suffix signature (final affix locations) or
// without a prefix signature (initial affix locations), adds the word's
// phonological information content multiplied by its corpus count. The
// total is computed fresh on each call in a local variable.
// NOTE(review): the declaration of `i` and the braces are not visible in
// this excerpt -- confirm against the full file.
512 double CMiniLexicon::CalculateCompressedLengthOfUnanalyzedWords( )
514 double CompressedLengthOfUnanalyzedWords
= 0;
515 CLexicon
* MotherLexicon
= GetLexicon();
518 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
520 for (i
= 0; i
< GetWords()->GetCount(); i
++)
522 if ( NULL
== GetWords()->GetAt(i
)->GetSuffixSignature() )
524 CompressedLengthOfUnanalyzedWords
+=
525 GetWords()->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
)
526 * GetWords()->GetAt(i
)->GetCorpusCount();
530 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
532 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
534 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixSignature() )
536 CompressedLengthOfUnanalyzedWords
+=
537 m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
)
538 * m_pWords
->GetAt(i
)->GetCorpusCount();
545 return CompressedLengthOfUnanalyzedWords
;
// Counts the words of m_pWords by whether IsAnalyzed() is true.
//
// NumberOfUnanalyzedWords - output parameter; reset to 0 and then
//                           incremented inside the loop.
// Returns the count kept in NumberOfAnalyzedWords.
// NOTE(review): the declaration of `pWord`, the braces and the `else` that
// presumably separates the two increments are not visible in this excerpt
// -- confirm against the full file.
548 int CMiniLexicon::GetNumberOfAnalyzedWords (int& NumberOfUnanalyzedWords
)
551 int NumberOfAnalyzedWords
= 0;
552 NumberOfUnanalyzedWords
= 0;
553 for (int i
= 0;i
< m_pWords
->GetCount(); i
++)
555 pWord
= m_pWords
->GetAt(i
);
556 if (pWord
->IsAnalyzed() )
558 NumberOfAnalyzedWords
++;
561 NumberOfUnanalyzedWords
++;
564 return NumberOfAnalyzedWords
;
568 // Log File functions
// When logging is on, streams StartTableRow followed by three
// MakeTableHeader() cells (s1, s2, s3) to the log file.
// NOTE(review): the end of the statement (presumably EndTableRow) and the
// closing brace are not visible in this excerpt -- confirm against the full
// file.
572 void CMiniLexicon::LogFileHeader(QString s1
, QString s2
, QString s3
)
573 { if (LogFileOn()) *GetLogFile() <<
575 StartTableRow
<< MakeTableHeader(s1
) << MakeTableHeader(s2
) << MakeTableHeader(s3
) <<
// When logging is on, streams SmallTitle(s1) to the log file, then
// StartTableRow followed by MakeTableHeader() cells for s2 and s3.
// NOTE(review): a line inside the stream expression and the closing braces
// are not visible in this excerpt; no EndTableRow is visible either --
// confirm against the full file.
579 void CMiniLexicon::LogFileSmallTitle(QString s1
, QString s2
, QString s3
)
580 { if (LogFileOn()) { *GetLogFile() << SmallTitle( s1
) <<
582 StartTableRow
<< MakeTableHeader(s2
) << MakeTableHeader(s3
) ;
585 void CMiniLexicon::LogFileSmallTitle(QString s
) { if (LogFileOn()) *GetLogFile() << SmallTitle( s
);}
586 void CMiniLexicon::LogFileSmallTitle(QString s
, QString t
) { if (LogFileOn()) *GetLogFile() << SmallTitle( s
) << StartTableRow
<< MakeTableHeader(t
) << EndTableRow
; }
587 void CMiniLexicon::LogFileLargeTitle(QString title
) { if (LogFileOn()) *GetLogFile() << LargeTitle(title
) << endl
; }
588 void CMiniLexicon::LogFileStartTable() { if (LogFileOn()) *GetLogFile() << StartTable
;}
589 void CMiniLexicon::LogFileEndTable() { if (LogFileOn()) *GetLogFile() << EndTable
;}
590 void CMiniLexicon::LogFileStartRow() { if (LogFileOn()) *GetLogFile() << StartTableRow
; }
591 void CMiniLexicon::LogFileEndRow() { if (LogFileOn()) *GetLogFile() << EndTableRow
; }
592 void CMiniLexicon::LogFileStartRow(QString str
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(str
); }
593 void CMiniLexicon::LogFile1SimpleString(QString s
) { if (LogFileOn()) *GetLogFile() << TableData(s
); }
594 void CMiniLexicon::LogFileSimpleString(QString s
) { if (LogFileOn()) *GetLogFile() << TableData(s
); }
595 void CMiniLexicon::LogFileSimpleDouble(double d
) { if (LogFileOn()) *GetLogFile() << TableData(d
); }
596 void CMiniLexicon::LogFileSimpleInteger(int n
) { if (LogFileOn()) *GetLogFile() << TableData(n
); }
597 void CMiniLexicon::LogFile (double d
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(d
) << EndTableRow
; }
598 void CMiniLexicon::LogFile (QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << EndTableRow
; }
599 void CMiniLexicon::LogFile (QString s
, int n
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData (n
) << EndTableRow
; }
600 void CMiniLexicon::LogFile (int n
, QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(n
) << TableData(s
) << EndTableRow
; }
601 void CMiniLexicon::LogFile (QString s1
, QString s2
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s1
) << TableData(s2
) << EndTableRow
; }
602 void CMiniLexicon::LogFile (QString s1
, double d
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s1
) << TableData( d
) << EndTableRow
; }
603 void CMiniLexicon::LogFile (QString s
,int i
,double d
){ if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData(i
) << TableData( d
) << EndTableRow
; }
604 void CMiniLexicon::LogFile (QString s
, int n
, int m
, double d
, double e
, double f
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData(n
) << TableData(m
) << TableData(d
) << TableData(e
) << TableData(f
) << EndTableRow
; }
605 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << EndTableRow
;}
606 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << EndTableRow
;}
607 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
, QString w
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << TableData(w
) << EndTableRow
;}
608 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
, QString w
, QString x
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << TableData(w
) << TableData (x
) << EndTableRow
;}
609 void CMiniLexicon::LogFileHeader( QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << EndTableRow
; }
610 void CMiniLexicon::LogFileHeader( QString s
, QString t
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << MakeTableHeader(t
) << EndTableRow
; }
611 void CMiniLexicon::LogFileHeader (QString s
, QString t
, QString u
, QString v
, QString w
, QString x
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << MakeTableHeader(t
) << MakeTableHeader(u
) << MakeTableHeader(v
) << MakeTableHeader(w
) << MakeTableHeader (x
) << EndTableRow
;}
612 void CMiniLexicon::LogFile (int n
, double d
, QString s
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(n
) << TableData(d
) << TableData(s
) << EndTableRow
;}