CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / MiniLexicon.cpp
blobdb2fe6c9a969347f915a5bed4248bf9abdb3c741
1 // Implementation of core CMiniLexicon methods
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <iostream>
6 #include <Q3TextStream>
7 #include <QTime>
8 #include "Lexicon.h"
9 #include "GUIclasses.h"
10 #include "FSA.h"
11 #include "DescriptionLength.h"
12 #include "Datum.h"
13 #include "Stem.h"
14 #include "SignatureCollection.h"
15 #include "PrefixCollection.h"
16 #include "SuffixCollection.h"
17 #include "WordCollection.h"
18 #include "StemCollection.h"
19 #include "POSCollection.h"
20 #include "AffixLocation.h"
21 #include "log2.h"
22 #include "Suffix.h"
23 #include "Prefix.h"
24 #include "HTML.h"
26 CMiniLexicon::CMiniLexicon(CLexicon* lexicon, int index,
27 enum eAffixLocation affixLocation)
28 : m_pLexicon(lexicon),
29 m_Index(index),
30 m_AffixLocation(affixLocation),
31 m_pWords(new CWordCollection(this)),
32 m_pSuffixes(is_initial(affixLocation) ?
33 0 :
34 new CSuffixCollection(this)),
35 m_pPrefixes(is_initial(affixLocation) ?
36 new CPrefixCollection(this) :
37 0),
38 m_pStems(new CStemCollection(this)),
39 m_pSignatures(is_initial(affixLocation) ?
40 new CSignatureCollection(this, m_pPrefixes, affixLocation) :
41 new CSignatureCollection(this, m_pSuffixes, affixLocation)),
42 m_pPOS(),
43 m_DescriptionLength(),
44 m_CorpusCountOfUnanalyzedWords(0.0),
45 m_PhonologicalInformationOfUnanalyzedWords(0.0),
46 m_GUIWords(new GUIWordCollection(this, m_pWords)),
47 m_DataMap()
48 //m_pFSA(0)
49 { }
51 CMiniLexicon::~CMiniLexicon()
53 // Update corpus words when deleting mini-lexicon
54 for (int i = 0; i < m_pWords->GetCount(); ++i) {
55 CStem* word = m_pWords->GetAt(i);
57 word->SimplifyParseStructure();
58 m_pLexicon->UpdateWord(word);
60 m_pLexicon->DoWordUpdates();
62 delete m_pWords;
63 delete m_pSuffixes;
64 delete m_pPrefixes;
65 delete m_pStems;
66 delete m_pSignatures;
67 delete m_pPOS;
68 delete m_DescriptionLength;
69 delete m_GUIWords;
70 //delete m_pFSA;
73 void CMiniLexicon::AddToScreen( QString msg )
75 m_pLexicon->AddToScreen( msg );
78 GUIWordCollection* CMiniLexicon::GetGUIWords()
80 return m_GUIWords;
83 int CMiniLexicon::GetCorpusCount()
85 return m_pLexicon->GetCorpusCount();
89 int CMiniLexicon::GetNumberOfCharacterTypes()
91 return m_pLexicon->GetNumberOfCharacterTypes();
95 CDLHistory* CMiniLexicon::GetDLHistory()
97 return m_pLexicon->GetDLHistory();
100 CStem* CMiniLexicon::GetWordFromStemSuffix(CStem* pStem, CSuffix* pSuffix)
102 if (pSuffix->Display() == TheStringNULL)
104 return *m_pWords ^= pStem->Display();
106 else
108 return *m_pWords ^= pStem->Display() + pSuffix->Display();
111 CStem* CMiniLexicon::GetWordFromStemPrefix(CStem* pStem, CPrefix* pPrefix)
113 if (pPrefix->Display() == TheStringNULL)
115 return *m_pWords ^= pStem->Display();
117 else
119 return *m_pWords ^= pPrefix->Display() + pStem->Display();
122 void CMiniLexicon::AddToWordCollection(CWordCollection* pWords,
123 enum which_words subset)
125 CStem* pWord, * qWord;
126 int w;
127 CStringSurrogate css;
129 for( w = 0; w < (int) pWords->GetCount(); w++ )
131 pWord = pWords->GetAt(w);
133 switch (subset) {
134 case WW_All:
135 qWord = (*m_pWords) << pWord;
136 qWord->SetWordType( pWord->GetWordType() );
137 break;
138 case WW_AnalyzedOnly:
139 if( pWord->Size() > 1 )
141 qWord = (*m_pWords) << pWord;
142 qWord->SetWordType( pWord->GetWordType() );
144 break;
145 case WW_UnanalyzedOnly:
146 default:
147 if( pWord->Size() <= 1 )
149 qWord = (*m_pWords) << pWord;
150 qWord->SetWordType( pWord->GetWordType() );
152 break;
158 void CMiniLexicon::AddToWordCollection( CStemCollection* pWords )
160 for (int w = 0; w < pWords->GetCount(); ++w) {
161 CStem* pWord = pWords->GetAt(w);
162 CStem* qWord = (*m_pWords) << pWord;
164 const bool compound =
165 pWord->GetStemType() == CStem::BIWORD_COMPOUND ||
166 pWord->GetStemType() == CStem::MULTIPLE_COMPOUND;
167 qWord->SetWordType(compound ?
168 CStem::STEM_COMPOUND : CStem::STEM_NORMAL);
173 void CMiniLexicon::ClearAll()
175 if( m_pStems ) m_pStems->Empty();
176 if( m_pWords ) m_pWords->Empty();
178 if( m_pSuffixes ) m_pSuffixes->Empty();
179 if( m_pPrefixes ) m_pPrefixes->Empty();
181 if( m_pSignatures ) m_pSignatures->Empty();
185 CCorpusWord* CMiniLexicon::FindAWord(CStem* pStem, CSuffix* pSuffix)
187 return m_pLexicon->FindAWord(pStem, pSuffix);
// Run the suffix-discovery sequence for this mini-lexicon and return
// m_pSuffixes.  The elapsed time is printed to std::cout.
//
// NOTE(review): the brace-only lines of this function are not visible in
// this view, so it cannot be told from here how many of the steps below
// are guarded by the STEM_FINAL / WORD_FINAL check -- confirm against the
// original file before restructuring.
191 CSuffixCollection* CMiniLexicon::FindSuffixes() //Suffixes/Run all
// Timer used only for the elapsed-time report at the end.
193 QTime t;
194 t.start();
// NOTE(review): mini_name and remark are not referenced again in the
// visible lines of this function.
196 QString mini_name( "Mini-Lexicon %1" );
197 mini_name = mini_name.arg( m_Index );
199 QString remark;
201 if( m_AffixLocation == STEM_FINAL || m_AffixLocation == WORD_FINAL )
// Initial pass over the words, given the stem, suffix and signature
// collections.
203 m_pWords->SuccessorFreq1(GetStems(),
204 GetSuffixes(), GetSignatures(), SF1,
205 CStem::NUMBER | CStem::UNKNOWN);
208 CheckSignatures();
// TakeSignaturesFindStems() is called twice in a row here.
// NOTE(review): presumably intentional -- confirm.
211 ExtendKnownStemsToKnownAffixes();
212 TakeSignaturesFindStems();
213 TakeSignaturesFindStems();
214 ExtendKnownStemsToKnownAffixes();
215 FromStemsFindAffixes(); // problem here @@@ oct 2008 jg
217 LooseFit();
219 CheckSignatures();
221 FindSingletonSignatures(); //problem here jan 2010
223 CheckSignatures();
225 FindMajorSignatures();
// Switch the word list's display mode, then compute the description length.
226 m_pWords->m_DisplayMode = CWordListViewItem::MiniLexicon_MorphologyStuffFirst;
227 CalculateDescriptionLength();
231 FindAllomorphy();
// Build a new FSA from this mini-lexicon and install it on the parent lexicon.
232 this->m_pLexicon->SetFSA(new FSA(this));
236 std::cout << "Find Suffixes: Time elapsed: " <<
237 t.elapsed() << "ms." << std::endl;
239 return m_pSuffixes;
243 CPrefixCollection* CMiniLexicon::FindPrefixes()
245 if( m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL )
247 QTime t;
248 t.start();
250 m_pWords->CreateReverseTrie();
252 m_pWords->PredecessorFreq1(GetStems(),
253 GetPrefixes(), GetSignatures(), SF1,
254 CStem::NUMBER | CStem::UNKNOWN);
256 CheckSignatures();
258 ExtendKnownStemsToKnownAffixes();
260 TakeSignaturesFindStems();
262 ExtendKnownStemsToKnownAffixes();
264 FromStemsFindAffixes();
266 ExtendKnownStemsToKnownAffixes();
268 LooseFit();
270 CheckSignatures();
272 FindSingletonSignatures();
274 std::cout << "Find Prefixes: Time elapsed: " <<
275 t.elapsed() << "ms." << std::endl;
277 if(this->m_pLexicon->GetFSA())
278 this->m_pLexicon->GetFSA()->AddPrefixes(this);
280 return m_pPrefixes;
282 else return NULL;
286 LinguisticaMainWindow* CMiniLexicon::GetDocument()
288 return m_pLexicon->GetDocument();
291 int CMiniLexicon::GetIntParameter( QString strParam, int iDefault )
293 return m_pLexicon->GetIntParameter( strParam, iDefault );
297 QTextStream* CMiniLexicon::GetLogFile()
299 return m_pLexicon->GetLogFileStream();
303 int CMiniLexicon::GetMiniCount()
305 return m_pLexicon->GetMiniCount();
309 int CMiniLexicon::GetMiniSize()
311 return m_pLexicon->GetMiniSize();
315 CMiniLexicon* CMiniLexicon::GetMiniLexicon( int index )
317 return m_pLexicon->GetMiniLexicon( index );
321 StringToString* CMiniLexicon::GetOutFilter()
323 return m_pLexicon->GetOutFilter();
327 bool CMiniLexicon::LogFileOn()
329 return (CLexicon*)m_pLexicon->LogFileOn();
333 bool CMiniLexicon::SetAffixLocation(enum eAffixLocation affixLoc)
335 if (m_AffixLocation == affixLoc)
336 // done, without lifting a finger!
337 return true;
339 if (is_initial(m_AffixLocation) == is_initial(affixLoc)) {
340 m_AffixLocation = affixLoc;
341 return true;
344 // Affix types differ: throw away discoveries.
346 if (m_pStems->GetCount() != 0)
347 // someone else could be using our discovered stems
348 return false;
350 if (is_initial(affixLoc)) {
351 Q_ASSERT(m_pPrefixes == 0);
352 m_pPrefixes = new CPrefixCollection(this);
354 delete m_pSuffixes;
355 m_pSuffixes = 0;
356 } else {
357 delete m_pPrefixes;
358 m_pPrefixes = 0;
360 Q_ASSERT(m_pSuffixes == 0);
361 m_pSuffixes = new CSuffixCollection(this);
364 m_AffixLocation = affixLoc;
366 delete m_pSignatures;
367 m_pSignatures = is_initial(affixLoc) ?
368 new CSignatureCollection(this, m_pPrefixes, affixLoc) :
369 new CSignatureCollection(this, m_pSuffixes, affixLoc);
370 return true;
373 int CMiniLexicon::GetCorpusCountOfUnanalyzedWords ( )
375 int i = 0;
378 m_CorpusCountOfUnanalyzedWords = 0;
380 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
382 for (i = 0; i < m_pWords->GetCount(); i++)
384 if ( 0 == m_pWords->GetAt(i)->GetSuffixSignature() )
386 m_CorpusCountOfUnanalyzedWords += m_pWords->GetAt(i)->GetCorpusCount();
390 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
393 for (i = 0; i < m_pWords->GetCount(); i++)
395 if ( 0 == m_pWords->GetAt(i)->GetPrefixSignature() )
397 m_CorpusCountOfUnanalyzedWords += m_pWords->GetAt(i)->GetCorpusCount();
402 return (int) m_CorpusCountOfUnanalyzedWords;
404 double CMiniLexicon::CalculateSumOfPointersToMyUnanalyzedWords ( eMDL_STYLE MDLflag)
406 int i = 0;
407 double total = 0;
408 double denominator = 0;
410 switch (MDLflag)
412 case CorpusCount:
415 denominator = GetCorpusCountOfUnanalyzedWords ( ) +
416 m_pStems->GetCorpusCount();
419 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
421 for (i = 0; i < m_pWords->GetCount(); i++)
423 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
425 total += base2log ( denominator / (double) m_pWords->GetAt(i)->GetCorpusCount() ) ;
429 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
432 for (i = 0; i < m_pWords->GetCount(); i++)
434 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
436 total += base2log ( denominator / m_pWords->GetAt(i)->GetCorpusCount() ) ;
440 break;
442 case GrammarCount:
443 int NumberOfUnanalyzedWords;
444 GetNumberOfAnalyzedWords(NumberOfUnanalyzedWords);
446 denominator = NumberOfUnanalyzedWords +
447 m_pStems->GetTotalUseCount ( );
449 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
451 for (i = 0; i < m_pWords->GetCount(); i++)
453 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
455 total += base2log ( denominator ) ;
459 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
462 for (i = 0; i < m_pWords->GetCount(); i++)
464 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
466 total += base2log ( denominator ) ;
472 break;
475 return total;
478 double CMiniLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent( )
480 CLexicon* MotherLexicon = GetLexicon();
481 int i;
482 if ( m_PhonologicalInformationOfUnanalyzedWords == 0)
484 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
486 for (i = 0; i < m_pWords->GetCount(); i++)
488 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
490 m_PhonologicalInformationOfUnanalyzedWords += m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon );
494 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
496 for (i = 0; i < m_pWords->GetCount(); i++)
498 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
500 m_PhonologicalInformationOfUnanalyzedWords += m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon );
507 return m_PhonologicalInformationOfUnanalyzedWords;
512 double CMiniLexicon::CalculateCompressedLengthOfUnanalyzedWords( )
514 double CompressedLengthOfUnanalyzedWords = 0;
515 CLexicon* MotherLexicon = GetLexicon();
516 int i;
518 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
520 for (i = 0; i < GetWords()->GetCount(); i++)
522 if ( NULL == GetWords()->GetAt(i)->GetSuffixSignature() )
524 CompressedLengthOfUnanalyzedWords +=
525 GetWords()->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon )
526 * GetWords()->GetAt(i)->GetCorpusCount();
530 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
532 for (i = 0; i < m_pWords->GetCount(); i++)
534 if ( 0 == m_pWords->GetAt(i)->GetPrefixSignature() )
536 CompressedLengthOfUnanalyzedWords +=
537 m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon )
538 * m_pWords->GetAt(i)->GetCorpusCount();
545 return CompressedLengthOfUnanalyzedWords;
548 int CMiniLexicon::GetNumberOfAnalyzedWords (int& NumberOfUnanalyzedWords)
550 CStem* pWord;
551 int NumberOfAnalyzedWords = 0;
552 NumberOfUnanalyzedWords = 0;
553 for (int i = 0;i < m_pWords->GetCount(); i++)
555 pWord = m_pWords->GetAt(i);
556 if (pWord->IsAnalyzed() )
558 NumberOfAnalyzedWords++;
559 } else
561 NumberOfUnanalyzedWords++;
564 return NumberOfAnalyzedWords;
568 // Log File functions
572 void CMiniLexicon::LogFileHeader(QString s1, QString s2, QString s3)
573 { if (LogFileOn()) *GetLogFile() <<
574 StartTable <<
575 StartTableRow << MakeTableHeader(s1) << MakeTableHeader(s2) << MakeTableHeader(s3) <<
576 EndTableRow;
579 void CMiniLexicon::LogFileSmallTitle(QString s1, QString s2, QString s3)
580 { if (LogFileOn()) { *GetLogFile() << SmallTitle( s1) <<
581 StartTable <<
582 StartTableRow << MakeTableHeader(s2) << MakeTableHeader(s3) ;
585 void CMiniLexicon::LogFileSmallTitle(QString s) { if (LogFileOn()) *GetLogFile() << SmallTitle( s );}
586 void CMiniLexicon::LogFileSmallTitle(QString s, QString t) { if (LogFileOn()) *GetLogFile() << SmallTitle( s ) << StartTableRow << MakeTableHeader(t) << EndTableRow; }
587 void CMiniLexicon::LogFileLargeTitle(QString title) { if (LogFileOn()) *GetLogFile() << LargeTitle(title) << endl; }
588 void CMiniLexicon::LogFileStartTable() { if (LogFileOn()) *GetLogFile() << StartTable;}
589 void CMiniLexicon::LogFileEndTable() { if (LogFileOn()) *GetLogFile() << EndTable;}
590 void CMiniLexicon::LogFileStartRow() { if (LogFileOn()) *GetLogFile() << StartTableRow; }
591 void CMiniLexicon::LogFileEndRow() { if (LogFileOn()) *GetLogFile() << EndTableRow; }
592 void CMiniLexicon::LogFileStartRow(QString str) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(str); }
593 void CMiniLexicon::LogFile1SimpleString(QString s) { if (LogFileOn()) *GetLogFile() << TableData(s); }
594 void CMiniLexicon::LogFileSimpleString(QString s) { if (LogFileOn()) *GetLogFile() << TableData(s); }
595 void CMiniLexicon::LogFileSimpleDouble(double d) { if (LogFileOn()) *GetLogFile() << TableData(d); }
596 void CMiniLexicon::LogFileSimpleInteger(int n) { if (LogFileOn()) *GetLogFile() << TableData(n); }
597 void CMiniLexicon::LogFile (double d) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(d) << EndTableRow; }
598 void CMiniLexicon::LogFile (QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << EndTableRow; }
599 void CMiniLexicon::LogFile (QString s, int n) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData (n) << EndTableRow; }
600 void CMiniLexicon::LogFile (int n, QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(n) << TableData(s) << EndTableRow; }
601 void CMiniLexicon::LogFile (QString s1, QString s2) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s1) << TableData(s2) << EndTableRow; }
602 void CMiniLexicon::LogFile (QString s1, double d) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s1) << TableData( d) << EndTableRow; }
603 void CMiniLexicon::LogFile (QString s,int i,double d){ if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData(i) << TableData( d) << EndTableRow; }
604 void CMiniLexicon::LogFile (QString s, int n, int m, double d, double e, double f) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData(n) << TableData(m) << TableData(d) << TableData(e) << TableData(f) << EndTableRow; }
605 void CMiniLexicon::LogFile (QString s, QString t, QString u) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << EndTableRow;}
606 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << EndTableRow;}
607 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v, QString w) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << TableData(w) << EndTableRow;}
608 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v, QString w, QString x) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << TableData(w) << TableData (x) << EndTableRow;}
609 void CMiniLexicon::LogFileHeader( QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << EndTableRow; }
610 void CMiniLexicon::LogFileHeader( QString s, QString t) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << MakeTableHeader(t) << EndTableRow; }
611 void CMiniLexicon::LogFileHeader (QString s, QString t, QString u, QString v, QString w, QString x) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << MakeTableHeader(t) << MakeTableHeader(u) << MakeTableHeader(v) << MakeTableHeader(w) << MakeTableHeader (x) << EndTableRow;}
612 void CMiniLexicon::LogFile (int n, double d, QString s) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(n) << TableData(d) << TableData(s) << EndTableRow;}