1 // Analyzing words using discovered signatures
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
10 #include "Signature.h"
12 #include "SignatureCollection.h"
13 #include "WordCollection.h"
14 #include "StringSurrogate.h"
18 // We accept any stem if it can match a good signature
// Uses already-discovered signatures to find further stems among the words
// of this mini-lexicon.
//
// For each signature in Sigs (visited in GetAtSort() order) that passes the
// size and stem-count tests:
//   1. one affix of the signature is stripped from every parsable word that
//      carries it, giving candidate stems (TempStems);
//   2. each candidate is combined with every affix of the signature and the
//      result is looked up in the set of parsable words; candidates that
//      survive are appended to NewStemsFound;
//   3. every word formed from a surviving stem and a signature affix is
//      re-cut at the stem/affix boundary and handed to the lexicon.
// Finally TakeSplitWords_ProduceStemsAndSigs() is called and an entry is
// appended to the description-length history.
//
// Sigs: the signature collection to iterate over.
//
// NOTE(review): this view of the function is missing a number of lines
// (braces, the declarations of pWord, pSig, suffixno, AffixLength,
// FailureFlag and colno, and the statements that follow several `if` tests).
// Comments below only describe what the visible lines do and mark the gaps.
20 void CMiniLexicon::TakeSignaturesFindStems(CSignatureCollection
* Sigs
)
22 CLexicon
& lex
= *m_pLexicon
;
23 linguistica::ui::status_user_agent
& status
= lex
.status_display();
// Tunable limits read from the lexicon's parameters. The second argument is
// presumably the fallback value used when the parameter is unset — confirm
// against CLexicon::GetIntParameter.
26 const int SizeThreshold
= lex
.GetIntParameter(
27 "TakeSignaturesFindStems\\SizeThreshold", 2);
28 const int StemCountThreshold
= lex
.GetIntParameter(
29 "TakeSignaturesFindStems\\StemCountThreshold", 2);
30 const int MinimumStemLength
= lex
.GetIntParameter(
31 "Main\\MinimumStemLength", 10);
33 CStringSurrogate ssAffix
;
34 CStringSurrogate ssStem
;
36 QString msg
, stem
, word
;
// ParsableWords is used as a set of word strings; the mapped int is unused.
37 QMap
<QString
, int> ParsableWords
;
// TempStems: candidate stems for the current signature.
// NewStemsFound: candidates that passed the check against ParsableWords.
38 QStringList TempStems
;
39 QStringList NewStemsFound
;
// NOTE(review): the text contains a %1 placeholder; the call that fills it
// in is not visible in this view.
45 status
.major_operation
=
46 QString("Mini-Lexicon %1: Take signatures to find stems")
48 status
.progress
.clear();
50 LogFileLargeTitle("Phase: Take Signatures, Find Stems");
// Any affix location that is not "initial" is handled as suffix analysis.
52 const bool analyzingSuffixes
= !is_initial(GetAffixLocation());
// Collect the display form of every word for which MayBeParsed() is true.
54 for (int wordno
= 0; wordno
< (int)m_pWords
->GetCount(); wordno
++) {
55 pWord
= m_pWords
->GetAt(wordno
);
56 if (pWord
->MayBeParsed())
57 ParsableWords
.insert(pWord
->Display(), 1); // 1 is a dummy value.
60 // We loop through the good signatures and
61 // then run through the words to see
62 // if they could belong to the good signatures.
63 // We have to be careful, because a word might
64 // have belonged to a different signature and
65 // still have the marks of those suffixes
66 // in its factorization.
68 LogFileHeader("--", "Signature");
70 // Go through signatures:
// Progress is reported as "signature index out of signature count".
73 status
.progress
.set_denominator(Sigs
->GetCount());
74 for (int signo
= 0; signo
< (int)Sigs
->GetCount(); signo
++) {
75 status
.progress
= signo
;
76 pSig
= Sigs
->GetAtSort(signo
);
// Signatures with too few affixes or too few stems are tested for here.
// The statement executed when a test succeeds is not visible in this view
// (presumably the signature is skipped — confirm).
79 if (pSig
->Size() < SizeThreshold
)
81 if (pSig
->GetNumberOfStems() < StemCountThreshold
)
84 LogFileSmallTitle("Empirical: " + pSig
->Display('.'));
85 status
.details
= pSig
->Display();
87 // Choose the first suffix in pSig that isn't NULL.
// Affix pieces are addressed with 1-based indices (see the loops over
// 1..Size() below). The declaration and assignment of suffixno are not
// visible in this view.
89 if (pSig
->GetPiece(1).IsNULL())
// NOTE(review): only NewStemsFound is visibly reset per signature; confirm
// that TempStems is also cleared on a line not shown here, otherwise
// candidates from earlier signatures would carry over.
93 NewStemsFound
.clear();
95 ssAffix
= pSig
->GetPiece(suffixno
);
96 AffixLength
= ssAffix
.GetLength();
// Pass 1: scan all parsable words and strip the chosen affix from those
// that carry it, collecting the remainders as candidate stems.
97 QMapIterator
<QString
, int> iter(ParsableWords
);
99 while (iter
.hasNext()) {
100 word
= iter
.next().key();
101 if (analyzingSuffixes
) {
102 if (word
.endsWith(ssAffix
.Display())) {
// A word that consists of nothing but the affix is tested for here; the
// statement taken in that case is not visible (the Q_ASSERT below relies on
// such words not reaching it).
103 if (word
.length() == AffixLength
)
// Suffix case: the stem is the part of the word before the affix.
105 stem
= word
.left(word
.length() - AffixLength
);
106 Q_ASSERT(stem
.length() != 0);
// Stems shorter than MinimumStemLength are tested for here; the statement
// taken in that case is not visible in this view.
107 if ((int) stem
.length() < MinimumStemLength
)
109 // put into Temp Stems
110 // all those stems from words which might
111 // be analyzed as ending in ssAffix.
112 TempStems
.append(stem
);
114 } else { // analyzing prefixes
115 if (word
.startsWith(ssAffix
.Display())) {
116 if (word
.length() == AffixLength
)
// Prefix case: the stem is the part of the word after the affix.
118 stem
= word
.right(word
.length() - AffixLength
);
119 Q_ASSERT(stem
.length() != 0);
120 if ((int) stem
.length() < MinimumStemLength
)
122 TempStems
.append(stem
);
125 } // end of loop on words
// Pass 2: for every candidate stem, build the word it would form with each
// affix of the signature and look it up in ParsableWords.
// numberofcolumns is compared with colno below; colno's declaration and
// updates are not visible (presumably log-output column wrapping).
129 const int numberofcolumns
= 8;
130 for (int stemno
= 0; stemno
< TempStems
.count(); stemno
++) {
132 stem
= TempStems
.at(stemno
);
133 // LogFileSimpleString(stem);
134 for (int affixno
=1; affixno
<= pSig
->Size(); affixno
++) {
135 ssAffix
= pSig
->GetPiece(affixno
);
// Conditional expression used as a statement: stem+affix when analyzing
// suffixes, affix+stem when analyzing prefixes.
136 analyzingSuffixes
? word
= stem
+ ssAffix
.Display():
137 word
= ssAffix
.Display() + stem
;
// The body run when the combined word is NOT a parsable word is not visible
// in this view (presumably it sets FailureFlag — confirm).
138 if (! ParsableWords
.contains(word
)) {
// Only stems for which FailureFlag stayed false are recorded and logged.
143 if (FailureFlag
== false) {
144 NewStemsFound
.append(stem
);
145 if (colno
== numberofcolumns
) {
151 LogFileSimpleString(stem
);
154 } // end of stemno loop
157 // Now start building up pSig again.
158 LogFileSmallTitle("Reanalyzed words");
// Pass 3: re-split every word formed from an accepted stem and a signature
// affix.
160 for (int stemno
= 0; stemno
< NewStemsFound
.count(); stemno
++) {
161 stem
= NewStemsFound
.at(stemno
);
// Same name and value as the constant declared before the previous stem
// loop; this declaration appears to shadow it inside this loop.
163 const int numberofcolumns
= 8;
164 for (int affixno
= 1; affixno
<= pSig
->Size(); affixno
++) {
165 ssAffix
= pSig
->GetPiece(affixno
);
// NULL affixes are tested for here; the statement taken in that case is not
// visible in this view.
166 if (ssAffix
.IsNULL())
// Rebuild the full word; the line separating the suffix form from the
// prefix form (presumably `else`) is not visible in this view.
169 if (analyzingSuffixes
)
170 word
= stem
+ ssAffix
.Display();
172 word
= ssAffix
.Display() + stem
;
// Look the word up in the word collection via its operator^=; the result is
// treated as a possibly-null pointer by the test below.
174 pWord
= *m_pWords
^= CStringSurrogate(word
);
// Suffix case: drop any existing root/suffix split, cut the word after the
// stem, and mark piece 1 as stem and piece 2 as suffix.
// NOTE(review): the statements after UpdateWord() below handle the prefix
// case and dereference pWord. If the (not visible) line joining the two
// groups is a plain `else`, a null pWord during suffix analysis would also
// reach them — confirm that pWord is checked on a line not shown here.
178 if (pWord
&& analyzingSuffixes
) {
179 pWord
->ClearRootSuffixSplit();
180 pWord
->CutRightBeforeHere(stem
.length());
181 pWord
->SetStemLoc(1);
182 pWord
->SetSuffixLoc(2);
183 m_pLexicon
->UpdateWord(pWord
);
// Prefix case: drop any existing prefix/stem split, cut the word after the
// prefix, and mark piece 1 as prefix and piece 2 as stem.
185 pWord
->ClearPrefixStemSplit();
186 pWord
->CutRightBeforeHere(ssAffix
.GetLength());
187 pWord
->SetStemLoc(2);
188 pWord
->SetPrefixLoc(1);
189 m_pLexicon
->UpdateWord(pWord
);
// Tag the word with the origin of this analysis only if it has no
// confidence text yet.
192 if (pWord
->GetConfidence().length() == 0) {
193 msg
= "3: From sig find stem";
194 pWord
->AppendToConfidence(msg
);
196 if (colno
== numberofcolumns
) {
202 LogFileSimpleString(pWord
->Display());
204 } // end of loop on affixno
// The statement guarded by this test is not visible in this view.
205 if (pSig
->Size() > 0)
207 }// cycle through this set of Stems
209 }// end of loop on signo
210 status
.progress
.clear();
211 status
.details
.clear();
// mini_name keeps its %1 placeholder until .arg(GetIndex() + 1) is applied
// further down, just before it is used for the history entry.
215 QString
mini_name("Mini-Lexicon %1");
216 msg
= "From sigs find stems";
217 CStringSurrogate ssRemark
= msg
;
219 // Writes to status.details instead of status.major_operation.
220 TakeSplitWords_ProduceStemsAndSigs(ssRemark
);
222 status
.major_operation
.clear();
223 mini_name
= mini_name
.arg(GetIndex() + 1);
224 QString remark
= "From sigs: find stems";
// Record this step in the history returned by GetDLHistory().
225 GetDLHistory()->append(mini_name
, remark
, this);