1 // Reconsidering discovered suffix-based morphology
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
14 #include "SignatureCollection.h"
15 #include "SuffixCollection.h"
16 #include "PrefixCollection.h"
17 #include "StemCollection.h"
18 #include "WordCollection.h"
// CheckSignatures: re-examines the stem/affix cut of the signatures in
// m_pSignatures.
//
// For up to LoopLimit passes, every signature with at least
// StemCountThreshold stems is handed to CSignature::CheckOut(this). When
// CheckOut returns a positive letter count, the signature is queued together
// with that count. The queued signatures are then walked stem by stem and
// affix by affix, the corresponding word is looked up in m_pWords, and (for
// suffix analysis only) its stem/suffix boundary is shifted by the negated
// count. Afterwards the affixes, stems and signatures are rebuilt from the
// word splits, the status display is reset, and an entry is appended to the
// history returned by GetDLHistory().
//
// Takes no arguments and returns nothing; all results are side effects on the
// mini-lexicon's collections, the log file and the status display.
21 void CMiniLexicon::CheckSignatures() // Suffixes/Check signatures
23 int NumberOfLettersToShift
= 0;
24 int InternalCount
= 0;
// Upper bound on the number of passes of the principal loop below.
// The second argument (1) is presumably the default used when the parameter
// is unset -- confirm against GetIntParameter.
26 int LoopLimit
= m_pLexicon
->GetIntParameter( "CheckSignatures\\LoopLimit", 1 ); // 3 );
// Null holds the literal text "NULL" that affixes are compared against below.
30 QString Null
= "NULL", msg
;
31 CStringSurrogate ssStem
;
// Signatures with fewer stems than this are skipped in the checking loop.
33 const int StemCountThreshold
= m_pLexicon
->GetIntParameter( "CheckSignatures\\StemCountThreshold", 2 );
34 CStringSurrogate ssAffix
;
// TRUE unless the affix location is STEM_INITIAL or WORD_INITIAL; the FALSE
// case selects m_pPrefixes and the "prefix/stem edge" messages below.
35 bool analyzingSuffixes
= TRUE
;
36 if( m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
) analyzingSuffixes
= FALSE
;
// NOTE(review): Actions is not referenced again in the statements visible in
// this function; confirm whether it is still needed.
// NOTE(review): std::auto_ptr was deprecated in C++11 and removed in C++17;
// std::unique_ptr is the usual replacement.
38 std::auto_ptr
<CSignatureCollection
> Actions(analyzingSuffixes
?
39 new CSignatureCollection(this, m_pSuffixes
, m_AffixLocation
) :
40 new CSignatureCollection(this, m_pPrefixes
, m_AffixLocation
));
// Announce the operation on the lexicon's status display and reset progress.
42 linguistica::ui::status_user_agent
& status
= m_pLexicon
->status_display();
43 status
.major_operation
= QString(
44 "Mini-Lexicon %1: Check signatures: stem/suffix edge.")
46 status
.progress
.clear();
// Signatures queued for modification, and for each one the letter count
// returned by CheckOut.
48 QHash
<CSignature
*, int> HowManyLettersToShift
;
49 QList
<CSignature
*> SignaturesToModify
;
// NOTE(review): both titles below open a parenthesis that is never closed.
51 if (analyzingSuffixes
) { LogFileLargeTitle("Phase: Check signatures (stem/suffix edge"); }
52 else { LogFileLargeTitle("Phase: Check signatures (prefix/stem edge"); }
54 //======================================================================//
55 // Principal loop, through Signatures
56 //======================================================================//
58 while ( LoopCount
< LoopLimit
)
// Each pass starts with an empty work list and a freshly sorted collection.
61 SignaturesToModify
.clear();
63 m_pSignatures
->Sort(SIGS
);
66 //----------------------------------------------------------------//
67 // Call to "CheckOut" to check each signature.
68 //----------------------------------------------------------------//
70 status
.progress
.set_denominator(m_pSignatures
->GetCount());
71 for ( int signo
= 0; signo
< (int)m_pSignatures
->GetCount(); signo
++)
// msg is "<pass number>: <signatures remaining>".
73 msg
= QString("%1").arg(LoopCount
) + ": " + QString("%1").arg( m_pSignatures
->GetCount() - signo
);
75 status
.progress
= signo
;
// Signatures are visited in the order established by Sort(SIGS) above.
77 pSig
= m_pSignatures
->GetAtSort(signo
);
78 pSig
->SetAffixLocation( m_AffixLocation
);
// Skip signatures that have too few stems.
80 if ( pSig
->GetNumberOfStems() < StemCountThreshold
) { continue; }
82 //==========================================================
83 NumberOfLettersToShift
= pSig
->CheckOut(this);
84 //==========================================================
// Only a positive result queues the signature for modification.
86 if ( NumberOfLettersToShift
> 0)
89 SignaturesToModify
.append(pSig
);
90 HowManyLettersToShift
.insert (pSig
,NumberOfLettersToShift
);
92 } // end of signo loop
// NOTE(review): no statement visible here changes InternalCount after its
// initialisation to 0; confirm it is updated when signatures are queued,
// otherwise this branch is always taken.
94 if (InternalCount
== 0) {
95 // There are no signatures being modified.
97 status
.details
.clear();
98 // XXX. not really an operation.
99 status
.major_operation
= (analyzingSuffixes
?
100 QString("Mini-Lexicon %1: End of Check signatures: stem/suffix edge.") :
101 QString("Mini-Lexicon %1: End of Check signatures: prefix/stem edge."))
103 LogFile("No signatures to modify now");
108 //----------------------------------------------------------------//
110 // Now we make the changes in the words which we have identified above.
111 // Bear in mind that the (positive or negative) integer in Sig.CorpusCount is the number of
112 // letters to the right or left that the stem/suffix cut should be shifted.
// NOTE(review): the code below reads the shift from HowManyLettersToShift,
// not from the signature's corpus count, and only positive values are ever
// stored there; the remark above may be out of date.
114 //----------------------------------------------------------------//
116 LogFileSmallTitle("Remaking signatures");
118 for (int signo
= 0; signo
< (int)SignaturesToModify
.size(); signo
++)
120 pSig
= SignaturesToModify
.at(signo
);
// Letter count recorded for this signature in the checking loop.
121 int NumberOfLettersShifted
= HowManyLettersToShift
.value(pSig
);
122 LogFileSmallTitle(pSig
->Display());
124 LogFileHeader("New signature", "Old stem", "New stem");
125 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++)
127 pStem
= pSig
->GetStem(stemno
);
128 ssStem
= pStem
->GetKey();
// Affix pieces are addressed from 1 through Size() inclusive.
130 for (int affixno
= 1; affixno
<= pSig
->Size(); affixno
++)
132 ssAffix
= pSig
->GetPiece(affixno
);
// The literal affix "NULL" is special-cased here; presumably it is turned
// into an empty affix, which the GetLength() == 0 tests below rely on --
// TODO confirm.
133 if ( ssAffix
== CStringSurrogate(Null
) )
// Rebuild the surface word: stem + suffix, or prefix + stem.
138 if( analyzingSuffixes
) PWord
= ssStem
+ ssAffix
;
139 else PWord
= ssAffix
+ ssStem
;
// Look the word up in m_pWords.
141 CStem
* pWord
= *m_pWords
^= PWord
;
// With an empty affix, a word that already has a suffix location (or, for
// prefixes, a stem location) greater than 0 is left alone.
143 if( analyzingSuffixes
)
145 if( ssAffix
.GetLength() == 0 && pWord
&& pWord
->GetSuffixLoc() > 0 ) continue; // the stem has an internal analysis already 3/2003
149 if( ssAffix
.GetLength() == 0 && pWord
&& pWord
->GetStemLoc() > 0 ) continue;
// NOTE(review): pWord is dereferenced from here on without the null guard
// used in the two tests above; confirm the lookup cannot return null.
152 if (pWord
->GetWordType() == CStem::BIWORD_COMPOUND
||
153 pWord
->GetWordType() == CStem::MULTIPLE_COMPOUND
||
154 pWord
->GetWordType() == CStem::POSSIBLE_COMPOUND
)
157 Q_ASSERT ( pWord
->IsValid() );
// Skip when the stem is not longer than the number of letters to shift.
159 if ( (int)ssStem
.GetLength() <= NumberOfLettersShifted
) { continue; } ;
160 // TODO: do the same thing below for prefixes if necessary
// Only suffix analysis moves the boundary; the prefix counterpart is the
// commented-out line below. The count is passed negated.
162 if( analyzingSuffixes
) pWord
->ShiftStemSuffixBoundary ( -1 * NumberOfLettersShifted
);
163 //else pWord->ShiftPrefixStemBoundary( pSig->GetCorpusCount() );
165 Q_ASSERT ( pWord
->IsValid() );
// Remember the stem of the most recently shifted word for the log line
// written after the affix loop.
166 newstem
= pWord
->GetStem().Display();
167 }// end of affixno loop
// One log row per stem; the first column ("New signature") is left empty.
168 LogFile("", ssStem
.Display(), newstem
);
172 } //end of signo loop
175 //----------------------------------------------------------------//
176 // End of Section *A*
178 //----------------------------------------------------------------//
180 } // end of LoopCount loop;
182 //-------------------------------------------------------------//
183 //////////////////////////////////////////////////////////////////////////////
// Rebuild the affix, stem and signature collections from the word splits,
// passing "Checking signatures" as the remark.
185 QString
Remark ("Checking signatures");
186 CStringSurrogate
ssRemark ( Remark
);
187 RebuildAffixesStemsAndSignaturesFromWordSplits( ssRemark
);
190 // XXX. not an operation
191 status
.major_operation
= (analyzingSuffixes
?
192 QString("Mini-Lexicon %1: "
193 "End of Check signatures: stem/suffix edge.") :
194 QString("Mini-Lexicon %1: "
195 "End of Check signatures: prefix/stem edge." ))
197 status
.progress
.clear();
198 status
.details
.clear();
// Record this step in the history under the name "Mini-Lexicon <index + 1>".
200 QString
mini_name( "Mini-Lexicon %1" );
201 mini_name
= mini_name
.arg( GetIndex() + 1 );
202 QString remark
= "Check stem/suffix cut";
203 GetDLHistory()->append( mini_name
, remark
, this );