CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / MiniLexicon_CheckAffixes.cpp
blob047d2d968bb09d9e490ef16ab0037c7a5e0a0e66
1 // Reconsidering discovered suffix-based morphology
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <memory>
6 #include "ui/Status.h"
7 #include "DLHistory.h"
8 #include "Lexicon.h"
9 #include "Signature.h"
10 #include "Suffix.h"
11 #include "Prefix.h"
12 #include "Affix.h"
13 #include "Stem.h"
14 #include "SignatureCollection.h"
15 #include "SuffixCollection.h"
16 #include "PrefixCollection.h"
17 #include "StemCollection.h"
18 #include "WordCollection.h"
19 #include "HTML.h"
21 void CMiniLexicon::CheckSignatures() // Suffixes/Check signatures
23 int NumberOfLettersToShift = 0;
24 int InternalCount = 0;
25 int LoopCount = 0;
26 int LoopLimit = m_pLexicon->GetIntParameter( "CheckSignatures\\LoopLimit", 1 ); // 3 );
27 //int i;
28 CSignature* pSig;
29 CStem* pStem;
30 QString Null = "NULL", msg;
31 CStringSurrogate ssStem;
32 CParse PWord;
33 const int StemCountThreshold = m_pLexicon->GetIntParameter( "CheckSignatures\\StemCountThreshold", 2 );
34 CStringSurrogate ssAffix;
35 bool analyzingSuffixes = TRUE;
36 if( m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL ) analyzingSuffixes = FALSE;
38 std::auto_ptr<CSignatureCollection> Actions(analyzingSuffixes ?
39 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation ) :
40 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation ));
42 linguistica::ui::status_user_agent& status = m_pLexicon->status_display();
43 status.major_operation = QString(
44 "Mini-Lexicon %1: Check signatures: stem/suffix edge.")
45 .arg(m_Index+1);
46 status.progress.clear();
48 QHash<CSignature*, int> HowManyLettersToShift;
49 QList<CSignature*> SignaturesToModify;
51 if (analyzingSuffixes) { LogFileLargeTitle("Phase: Check signatures (stem/suffix edge"); }
52 else { LogFileLargeTitle("Phase: Check signatures (prefix/stem edge"); }
54 //======================================================================//
55 // Principal loop, through Signatures
56 //======================================================================//
58 while ( LoopCount < LoopLimit )
60 LoopCount++;
61 SignaturesToModify.clear();
62 InternalCount = 0;
63 m_pSignatures->Sort(SIGS);
66 //----------------------------------------------------------------//
67 // Call to "CheckOut" to check each signature.
68 //----------------------------------------------------------------//
70 status.progress.set_denominator(m_pSignatures->GetCount());
71 for ( int signo = 0; signo < (int)m_pSignatures->GetCount(); signo++)
73 msg = QString("%1").arg(LoopCount) + ": " + QString("%1").arg( m_pSignatures->GetCount() - signo );
74 status.details = msg;
75 status.progress = signo;
77 pSig = m_pSignatures->GetAtSort(signo);
78 pSig->SetAffixLocation( m_AffixLocation );
80 if ( pSig->GetNumberOfStems() < StemCountThreshold ) { continue; }
82 //==========================================================
83 NumberOfLettersToShift = pSig->CheckOut(this);
84 //==========================================================
86 if ( NumberOfLettersToShift > 0)
88 InternalCount ++;
89 SignaturesToModify.append(pSig);
90 HowManyLettersToShift.insert (pSig,NumberOfLettersToShift);
92 } // end of signo loop
94 if (InternalCount == 0) {
95 // There are no signatures being modified.
96 // Leave function.
97 status.details.clear();
98 // XXX. not really an operation.
99 status.major_operation = (analyzingSuffixes ?
100 QString("Mini-Lexicon %1: End of Check signatures: stem/suffix edge.") :
101 QString("Mini-Lexicon %1: End of Check signatures: prefix/stem edge."))
102 .arg(m_Index+1);
103 LogFile("No signatures to modify now");
104 return;
108 //----------------------------------------------------------------//
109 // Section *A*
110 // Now we make the changes in the words which we have identified above.
111 // Bear in mind that the (positive or negative) integer in Sig.CorpusCount is the number of
112 // letters to the right or left that the stem/suffix cut should be shifted.
114 //----------------------------------------------------------------//
116 LogFileSmallTitle("Remaking signatures");
117 QString newstem;
118 for (int signo = 0; signo < (int)SignaturesToModify.size(); signo++)
120 pSig = SignaturesToModify.at(signo);
121 int NumberOfLettersShifted = HowManyLettersToShift.value(pSig);
122 LogFileSmallTitle(pSig->Display());
123 LogFileStartTable();
124 LogFileHeader("New signature", "Old stem", "New stem");
125 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++)
127 pStem = pSig->GetStem(stemno);
128 ssStem = pStem->GetKey();
130 for (int affixno = 1; affixno <= pSig->Size(); affixno++)
132 ssAffix = pSig->GetPiece(affixno);
133 if ( ssAffix == CStringSurrogate(Null) )
135 ssAffix.MakeNull();
138 if( analyzingSuffixes ) PWord = ssStem + ssAffix;
139 else PWord = ssAffix + ssStem;
141 CStem* pWord = *m_pWords ^= PWord;
143 if( analyzingSuffixes )
145 if( ssAffix.GetLength() == 0 && pWord && pWord->GetSuffixLoc() > 0 ) continue; // the stem has an internal analysis already 3/2003
147 else
149 if( ssAffix.GetLength() == 0 && pWord && pWord->GetStemLoc() > 0 ) continue;
152 if (pWord->GetWordType() == CStem::BIWORD_COMPOUND ||
153 pWord->GetWordType() == CStem::MULTIPLE_COMPOUND ||
154 pWord->GetWordType() == CStem::POSSIBLE_COMPOUND)
155 continue;
157 Q_ASSERT ( pWord->IsValid() );
159 if ( (int)ssStem.GetLength() <= NumberOfLettersShifted ) { continue; } ;
160 // TODO: do the same thing below for prefixes if necessary
162 if( analyzingSuffixes ) pWord->ShiftStemSuffixBoundary ( -1 * NumberOfLettersShifted );
163 //else pWord->ShiftPrefixStemBoundary( pSig->GetCorpusCount() );
165 Q_ASSERT ( pWord->IsValid() );
166 newstem = pWord->GetStem().Display();
167 }// end of affixno loop
168 LogFile("", ssStem.Display(), newstem);
171 LogFileEndTable();
172 } //end of signo loop
175 //----------------------------------------------------------------//
176 // End of Section *A*
178 //----------------------------------------------------------------//
179 LogFileEndTable();
180 } // end of LoopCount loop;
182 //-------------------------------------------------------------//
183 //////////////////////////////////////////////////////////////////////////////
184 // Redo Signatures
185 QString Remark ("Checking signatures");
186 CStringSurrogate ssRemark ( Remark);
187 RebuildAffixesStemsAndSignaturesFromWordSplits( ssRemark );
190 // XXX. not an operation
191 status.major_operation = (analyzingSuffixes ?
192 QString("Mini-Lexicon %1: "
193 "End of Check signatures: stem/suffix edge.") :
194 QString("Mini-Lexicon %1: "
195 "End of Check signatures: prefix/stem edge." ))
196 .arg(m_Index + 1);
197 status.progress.clear();
198 status.details.clear();
200 QString mini_name( "Mini-Lexicon %1" );
201 mini_name = mini_name.arg( GetIndex() + 1 );
202 QString remark = "Check stem/suffix cut";
203 GetDLHistory()->append( mini_name, remark, this );