CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Signature.cpp
blob4d7d57b5def69a2b32f3b2eed1b92e038395d024
1 // Implementation of CSignature, CSignatureListViewItem methods
2 // Copyright © 2009 The University of Chicago
3 #include "Signature.h"
4 #include <QMessageBox>
5 #include <QTextStream>
6 #include <QList>
7 #include "linguisticamainwindow.h"
8 #include <QPair>
9 #include "MiniLexicon.h"
10 #include "LPreferences.h"
11 #include "CorpusWord.h"
12 #include "Suffix.h"
13 #include "Prefix.h"
14 #include "Stem.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "SparseIntVector.h"
21 #include "CompareFunc.h"
22 #include "HTML.h"
23 #include "log2.h"
24 #include "Typedefs.h"
25 #include "implicit_cast.h"
27 bool stemlessthan(const QPair<CStem*, int> pair1, const QPair<CStem*, int> pair2 );
29 bool stemlessthan(const QPair<CStem*, int> pair1, const QPair<CStem*, int> pair2 )
31 return pair2.second < pair1.second;
35 //===================================================================================================//
37 // Signature listview item
39 //===================================================================================================//
40 CSignatureListViewItem::CSignatureListViewItem(Q3ListView *parent,
41 QString signature, int mini, CSignature* pSig,
42 QMap<QString, QString>* filter)
43 : Q3ListViewItem( parent, signature )
45 m_signature = pSig;
46 m_filter = filter;
47 m_label = signature;
48 m_parentlist = parent;
49 m_mini = mini;
53 CSignatureListViewItem::CSignatureListViewItem(Q3ListViewItem *parent,
54 QString signature, int mini, CSignature* pSig,
55 QMap<QString, QString>* filter)
56 : Q3ListViewItem( parent, signature )
58 m_signature = pSig;
59 m_filter = filter;
60 m_label = signature;
61 m_parentlist = parent->listView();
62 m_mini = mini;
65 int CSignatureListViewItem::compare(Q3ListViewItem *item, int col, bool asc) const
68 if (col== 2)
70 return MakeComparable ( m_signature->ComputeDLofModel() , ((CSignatureListViewItem*) item)->GetSignature()->ComputeDLofModel() );
72 if (col== 3)
74 return MakeComparable ( m_signature->GetCorpusCount() , ((CSignatureListViewItem*) item)->GetSignature()->GetCorpusCount() );
76 if (col== 4)
78 return MakeComparable ( m_signature->GetNumberOfStems() , ((CSignatureListViewItem*) item)->GetSignature()->GetNumberOfStems() );
80 if (col== 6)
82 return MakeComparable ( ((CSignatureListViewItem*) item)->GetSignature()->GetRobustness(), m_signature->GetRobustness() );
84 else
86 return Q3ListViewItem::compare(item, col, asc);
91 QString CSignatureListViewItem::text( int column ) const
95 CSignatureListViewItem* child = NULL;
97 int count;
98 QString dummy;
101 switch( column )
103 case 0:
104 if( m_signature && m_parentlist->sortColumn() == 0 && m_signature->GetMentor() )
106 return " : " + m_label;
108 else return m_label;
109 case 1:
110 if( m_signature && m_signature->GetNumberOfStems() > 0 )
112 if (m_signature->GetNumberOfStems() > 0 ) return m_signature->GetStem(0)->Display( QChar(0), m_filter );
114 else return "";
115 case 2:
116 if( m_signature ) return dummy.setNum( m_signature->ComputeDLofModel() );
117 else return "";
118 case 3:
119 if( m_signature ) return dummy.setNum ( m_signature->GetCorpusCount() );
120 else
122 count = 0;
123 child = (CSignatureListViewItem*) firstChild();
124 while( child )
126 if( child->GetSignature() )
128 count += child->GetSignature()->GetCorpusCount();
130 child = (CSignatureListViewItem*) child->nextSibling();
132 return dummy.setNum( count );
134 case 4:
135 if( m_signature && m_signature->GetNumberOfStems() > 0 ) return dummy.setNum( m_signature->GetNumberOfStems() );
136 else
138 count = 0;
139 child = (CSignatureListViewItem*) firstChild();
140 while( child )
142 if( child->GetSignature() &&
143 child->GetSignature()->GetNumberOfStems() > 0 )
145 count += child->GetSignature()->GetNumberOfStems();
147 child = (CSignatureListViewItem*) child->nextSibling();
149 return dummy.setNum( count );
151 case 5:
152 if( m_signature ) return m_signature->GetRemark();
153 else return "";
155 case 6:
156 if( m_signature ) return dummy.setNum( (int) m_signature->GetRobustness() );
158 else
160 count = 0;
161 child = (CSignatureListViewItem*) firstChild();
162 while( child )
164 if( child->GetSignature() &&
165 child->GetSignature()->GetNumberOfStems() > 0 )
167 count += child->GetSignature()->GetNumberOfStems();
169 child = (CSignatureListViewItem*) child->nextSibling();
171 return dummy.setNum( count );
173 case 7:
174 return "";
175 default:
176 return Q3ListViewItem::text( column );
180 //===================================================================================================//
182 // GUI stuff
184 //===================================================================================================//
185 void CSignature::BorrowedSigsDisplay(Q3ListView* List,
186 QMap<QString, QString>* filter)
188 QString source = "Unknown", dummy;
189 for (int minino = 0; minino < m_pMyMini->GetMiniSize(); ++minino) {
190 CMiniLexicon* mini = m_pMyMini->GetMiniLexicon(minino);
191 if (mini == 0)
192 continue;
194 CSignatureCollection& sigs = *mini->GetSignatures();
195 if (sigs ^= this) {
196 // found!
197 source = dummy.setNum(minino + 1);
198 break;
202 static_cast<void>(new Q3ListViewItem(
203 List, Display('.', filter), source));
206 //===================================================================================================//
208 // Constructor/destructor
210 //===================================================================================================//
212 CSignature::CSignature( CMiniLexicon* Lexicon ) : CLParse( Lexicon )
214 m_pMyMini = Lexicon;
216 m_StemPtrList = new QList<CStem*>();
217 m_WordPtrList = new QList<CStem*>();
218 m_MentorList = new QList<CSignature*>();
219 m_SuffixPtrList = new QList<CSuffix*>();
220 m_PrefixPtrList = new QList<CPrefix*>();
221 m_SortStyle = eAlphabetized;
222 // Description Length
223 m_DLofMyCorpus = 0;
224 m_DLofMyStemPointers = 0;
225 m_DLofMyAffixPointers = 0;
226 m_LengthOfPointerToMe = 0;
227 m_MyGeneralizer = NULL;
228 m_Remark = "";
229 m_Robustness = 0;
230 m_Mentor = NULL;
231 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
235 CSignature::CSignature( eAffixLocation AffixLocation, CMiniLexicon* Lexicon ) : CLParse( Lexicon )
237 m_pMyMini = Lexicon;
238 m_StemPtrList = new QList<CStem*>();
239 m_WordPtrList = new QList<CStem*>();
240 m_MentorList = new QList<CSignature*>();
241 m_SuffixPtrList = new QList<CSuffix*>();
242 m_PrefixPtrList = new QList<CPrefix*>();
243 m_SortStyle = eAlphabetized;
244 m_MyGeneralizer = NULL;
245 m_AffixLocation = AffixLocation;
247 m_Remark = "";
248 // Description Length
249 m_DLofMyCorpus = 0;
250 m_DLofMyStemPointers = 0;
251 m_DLofMyAffixPointers = 0;
252 m_LengthOfPointerToMe = 0;
253 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
257 CSignature::CSignature (const CParse& ParseSig, CMiniLexicon* Lexicon) : CLParse ( ParseSig, Lexicon )
259 m_pMyMini = Lexicon;
260 m_AffixLocation = Lexicon->GetAffixLocation();
261 m_StemPtrList = new QList<CStem*>();
262 m_WordPtrList = new QList<CStem*>();
263 m_MentorList = new QList<CSignature*>();
264 m_SuffixPtrList = new QList<CSuffix*>();
265 m_PrefixPtrList = new QList<CPrefix*>();
266 m_SortStyle = eAlphabetized;
267 m_Remark = "";
268 m_MyGeneralizer = NULL;
269 // Description Length
270 m_DLofMyCorpus = 0;
271 m_DLofMyStemPointers = 0;
272 m_DLofMyAffixPointers = 0;
273 m_LengthOfPointerToMe = 0;
274 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
278 CSignature::CSignature (const CParse* pParseSig, CMiniLexicon* Lexicon) : CLParse ( *pParseSig, Lexicon )
280 m_pMyMini = Lexicon;
282 m_StemPtrList = new QList<CStem*>();
283 m_WordPtrList = new QList<CStem*>();
284 m_MentorList = new QList<CSignature*>();
285 m_SuffixPtrList = new QList<CSuffix*>();
286 m_PrefixPtrList = new QList<CPrefix*>();
287 m_SortStyle = eAlphabetized;
288 m_MyGeneralizer = NULL;
289 m_AffixLocation = Lexicon->GetAffixLocation();
290 m_Remark = "";
291 // Description Length
292 m_DLofMyCorpus = 0;
293 m_DLofMyStemPointers = 0;
294 m_DLofMyAffixPointers = 0;
295 m_LengthOfPointerToMe = 0;
296 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
300 CSignature::CSignature(const CSignature& Sig) : CLParse (Sig, Sig.GetLexicon())
302 int affixno,
303 stemno;
304 m_AffixLocation = Sig.GetAffixLocation();
305 m_Remark = Sig.GetRemark();
306 m_pMyMini = Sig.GetLexicon();
307 m_MyGeneralizer = Sig.GetGeneralizer();
309 int NumberOfStems = Sig.GetNumberOfStems();
310 int NumberOfAffixes = Sig.Size();
311 int NumberOfWords = NumberOfStems*NumberOfAffixes;
312 QVector<double> m_WordCounts (NumberOfAffixes * NumberOfStems );
313 QVector<double> m_StemCounts ( NumberOfStems );
314 QVector<double> m_AffixCounts( NumberOfAffixes );
315 QVector<double> m_WordFrequencies (NumberOfWords);
316 QVector<double> m_StemFrequencies (NumberOfStems);
317 QVector<double> m_AffixFrequencies (NumberOfAffixes);
318 m_TotalCount = Sig.GetTotalCount();
320 m_StemPtrList = new QList<CStem*>();
321 for ( stemno = 0; stemno < NumberOfStems; stemno++)
323 AppendStemPtr( Sig.GetStem(stemno));
324 m_StemCounts[stemno] = Sig.GetStemCount(stemno);
325 m_StemFrequencies[stemno] = Sig.GetStemFrequency(stemno);
327 if (m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL) {
328 m_SuffixPtrList = new QList<CSuffix*>();
329 for ( affixno = 0; affixno < NumberOfAffixes; affixno++)
331 AppendSuffixPtr ( Sig.GetSuffix(affixno) );
332 m_AffixCounts[affixno] = Sig.GetAffixCount(affixno);
333 m_AffixFrequencies[affixno] = Sig.GetAffixFrequency(affixno);
336 if (m_AffixLocation == WORD_INITIAL || m_AffixLocation == STEM_INITIAL) {
337 m_PrefixPtrList = new QList<CPrefix*>();
338 for ( affixno = 0; affixno < NumberOfAffixes; affixno++)
340 AppendPrefixPtr ( Sig.GetPrefix(affixno) );
341 m_AffixCounts[affixno] = Sig.GetAffixCount(affixno);
342 m_AffixFrequencies[affixno] = Sig.GetAffixFrequency(affixno);
347 m_WordPtrList = new QList<CStem*>();
348 for (stemno = 0; stemno < NumberOfStems ; stemno++) {
349 for (affixno = 0; affixno < NumberOfAffixes; affixno++) {
350 SetWordCount(stemno, affixno, 0);
351 AppendWordPointer (Sig.GetWord(stemno, affixno));
355 m_Robustness = Sig.GetRobustness();
356 m_Mentor = NULL;
357 m_SortStyle = eAlphabetized;
358 m_MentorList = new QList<CSignature*>();
367 CSignature::CSignature(const CStringSurrogate& ssSig, CMiniLexicon* Lexicon) : CLParse(ssSig, Lexicon)
369 Collapse( ssSig, '.');
370 m_pMyMini = Lexicon;
372 m_StemPtrList = new QList<CStem*>();
373 m_WordPtrList = new QList<CStem*>();
374 m_MentorList = new QList<CSignature*>();
375 m_SuffixPtrList = new QList<CSuffix*>();
376 m_PrefixPtrList = new QList<CPrefix*>();
377 m_SortStyle = eAlphabetized;
378 m_MyGeneralizer = NULL;
379 // Description Length
380 m_DLofMyCorpus = 0;
381 m_DLofMyStemPointers = 0;
382 m_DLofMyAffixPointers = 0;
383 m_LengthOfPointerToMe = 0;
384 m_Remark = "";
385 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
386 m_Robustness = 0;
387 m_Mentor = NULL;
389 m_SortStyle = eAlphabetized;
393 CSignature::~CSignature()
396 if( m_StemPtrList ) delete m_StemPtrList;
397 if( m_WordPtrList ) delete m_WordPtrList;
398 if( m_MentorList ) delete m_MentorList;
399 if( m_SuffixPtrList ) delete m_SuffixPtrList;
400 if( m_PrefixPtrList ) delete m_PrefixPtrList ;
402 //===================================================================================================//
404 // Display
406 //===================================================================================================//
407 QString CSignature::Display(QChar sep, QMap<QString, QString>* filter) const
409 QString sd = sep;
410 if (sd == ".") {
411 sd = m_pMyMini->GetDocument()->GetPreferences()
412 ->GetPreference("Sig_Delimiter");
413 if (sd.size() != 1)
414 sd = ".";
416 return CParse::Display(sd.at(0), filter);
419 QString CSignature::Display(QMap<QString, QString>* filter) const
420 { return CParse::Display(filter); }
422 QString CSignature::Display() const
423 { return CParse::Display('.'); }
425 //===================================================================================================//
429 //===================================================================================================//
431 void CSignature::ConsumeParse( CParse* pParse )
433 ClearParse();
434 SetKey( pParse );
435 CopyParseStructure( *pParse );
439 void CSignature::Suicide()
441 //TODO: fill this in;
443 void CSignature::SetMyGeneralizer (CSignature* pSig)
445 m_MyGeneralizer = pSig;
447 //===================================================================================================//
449 // Operators
451 //===================================================================================================//
452 void CSignature::operator=(const CSignature* pSig)
454 m_pMyMini = pSig->GetMyMini();
455 CLParse::operator=(*pSig);
456 m_AffixLocation = pSig->GetAffixLocation();
458 int NumberOfStems = pSig->GetNumberOfStems();
459 int NumberOfAffixes = pSig->GetNumberOfAffixes();
460 int NumberOfWords = NumberOfStems*NumberOfAffixes;
461 m_StemCounts.resize(NumberOfStems);
462 m_WordCounts.resize(NumberOfWords);
463 m_AffixCounts.resize(NumberOfAffixes);
465 m_StemCounts.resize(NumberOfStems);
466 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++) {
467 m_StemPtrList->append ( pSig->GetStem(stemno) );
468 m_StemCounts[stemno]=pSig->GetStemCount(stemno);
469 for (int affixno = 0; affixno < pSig->GetNumberOfAffixes(); affixno++)
471 m_WordPtrList->append ( pSig->GetWord(stemno, affixno));
472 SetWordCount(stemno, affixno, pSig->GetWordCount(stemno, affixno));
476 if (m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL ) {
477 for (int suffixno = 0; suffixno < pSig->GetNumberOfAffixes(); suffixno++)
479 m_SuffixPtrList->append ( pSig->GetSuffix(suffixno) );
480 m_AffixCounts[suffixno] = pSig->GetAffixCount(suffixno);
482 } else {
483 for (int prefixno = 0; prefixno < GetNumberOfAffixes(); prefixno++) {
484 m_PrefixPtrList->append(pSig->GetPrefix(prefixno) );
485 m_AffixCounts[prefixno] = pSig->GetAffixCount(prefixno);
491 m_Robustness = pSig->GetRobustness();
492 m_Mentor = NULL;
493 m_Remark = pSig->GetRemark();
497 QTextStream& operator<< (QTextStream& stream, CSignature* pSig)
499 CStem* pStem;
501 stream << endl << pSig->Display();
502 stream.width(6);
503 stream << pSig -> GetNumberOfStems() << " " << pSig->GetCorpusCount();
505 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++)
507 pStem = pSig->GetStem(stemno);
508 if ( pStem->GetKey() != CStringSurrogate() )
510 stream << endl;
511 stream.width(20);
512 stream << pStem->GetKey().Display();
513 } else
515 stream << endl;
516 stream.width(20);
517 stream << "???";
521 return stream;
524 // <<-------------------------------------------------------------------------------------------------------->>
525 void CSignature::operator<< (CStem* pStem) //add to tail of list.
528 CStem* pWord;
530 if ( m_StemPtrList->indexOf ( pStem ) < 0 )
532 Q_ASSERT (pStem->GetKeyLength() > 0);
533 m_StemPtrList->append(pStem);
536 Q_ASSERT ( m_PieceCount <= m_LengthOfPieceVector ) ;
538 for (int wordno = 0; wordno < pStem->GetWordPtrList()->size(); wordno++)
540 pWord = pStem->GetWord(wordno);
541 Q_ASSERT (pWord->GetKeyLength() > 0);
542 m_WordPtrList->append (pWord);
544 pStem->SetSuffixSignature (this);
546 m_Robustness = 0;
547 m_Robustness = GetRobustness();
550 //===================================================================================================//
552 // Accessors and setters
554 //===================================================================================================//
555 CSignature* CSignature::GetMentor ( ) { return m_Mentor; }
556 // <<-------------------------------------------------------------------------------------------------------->>
557 void CSignature::SetMentor ( CSignature* pSig )
559 m_Mentor = pSig;
560 if( pSig && pSig->GetMentorList() && pSig->GetMentorList()->indexOf (this) < 0) {
561 pSig->GetMentorList()->append( this );
566 int CSignature::GetNumberOfAffixes() const
569 if ( m_AffixLocation == STEM_FINAL || m_AffixLocation == WORD_FINAL)
571 return m_SuffixPtrList->count();
573 if ( m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
575 return m_PrefixPtrList->count();
577 return 0;
581 void CSignature::AppendSuffixPtr (CSuffix* pSuffix) { m_SuffixPtrList->append(pSuffix);}
582 QList<CSignature*>* CSignature::GetMentorList( ) { return m_MentorList; }
583 int CSignature::GetNumberOfStems() const { return m_StemPtrList->count(); }
584 //int CSignature::GetNumberOfSuffixes () const { return m_SuffixPtrList->count(); }
585 void CSignature::SetRemark ( QString remark) { m_Remark = remark; }
586 CPrefix* CSignature::GetPrefix(int prefixno) const { return m_PrefixPtrList->at(prefixno); }
587 QList<CPrefix*>* CSignature::GetPrefixPtrList() const { return m_PrefixPtrList; }
588 QString CSignature::GetRemark() const { return m_Remark; }
589 QList<CStem*>* CSignature::GetStemPtrList() const { return m_StemPtrList;}
590 CStem* CSignature::GetStem(int stemno) const { return m_StemPtrList->at(stemno); }
591 CSuffix* CSignature::GetSuffix(int suffixno) const { return m_SuffixPtrList->at(suffixno); }
592 QList<CSuffix*>* CSignature::GetSuffixPtrList() const { return m_SuffixPtrList; }
593 int CSignature::GetTotalCount() const { return m_TotalCount; }
594 double CSignature::GetCorpusCount() const { return corpus_count::GetCorpusCount();}
595 float CSignature::GetSortingQuantity() const { return (float) GetRobustness();}
597 bool CSignature::StemListContains(CStem* pstem) { return m_StemPtrList->contains(pstem); }
598 void CSignature::AppendStemPtr(CStem* pStem) const { m_StemPtrList->append(pStem);}
601 eAffixLocation CSignature::GetAffixLocation() const { return m_AffixLocation; }
602 // <<-------------------------------------------------------------------------------------------------------->>
603 CStem* CSignature::GetWord(int stemno, int affixno) const
605 if (stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes())
606 return NULL;
607 if (stemno * GetNumberOfAffixes() + affixno >= m_WordPtrList->size() )
608 return NULL;
609 return m_WordPtrList->at(stemno* GetNumberOfAffixes() + affixno);
611 CParse CSignature::GetStems()
613 CParse List;
614 List.Alphabetize();
615 if ( m_StemPtrList->count() == 0 ) { return List; } // ********** This is clearly a mistake. Fix it.
617 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
619 List.Append( GetStem(stemno)->GetKey() );
621 return List;
623 // <<-------------------------------------------------------------------------------------------------------->>
632 // <<-------------------------------------------------------------------------------------------------------->>
633 double CSignature::GetStemFrequency(int stemno ) const {
634 if (stemno < 0 || stemno > GetNumberOfStems() ) return 0;
635 return m_StemFrequencies[stemno];
638 // <<-------------------------------------------------------------------------------------------------------->>
639 double CSignature::GetAffixFrequency(int affixno ) const {
640 if (affixno < 0 || affixno > GetNumberOfAffixes() ) {return 0; }
641 return m_AffixFrequencies[affixno];
643 // <<-------------------------------------------------------------------------------------------------------->>
644 double CSignature::GetStemCount(int stemno) const {
645 if (stemno < 0 || stemno > GetNumberOfStems() ){ return 0; }
646 return m_StemCounts[stemno];
648 // <<-------------------------------------------------------------------------------------------------------->>
650 double CSignature::GetAffixCount(int affixno) const
651 { if (affixno < 0 || affixno > GetNumberOfAffixes() ) return 0;
652 return m_AffixCounts[affixno];
654 // <<-------------------------------------------------------------------------------------------------------->>
655 double CSignature::GetWordCount(int wordno)const {
656 if (wordno < 0 || wordno > GetNumberOfWords() ) { return 0;}
657 return m_WordCounts[wordno]; }
658 // <<-------------------------------------------------------------------------------------------------------->>
660 //===================================================================================================//
662 // Calculate frequencies and counts
664 //===================================================================================================//
665 void CSignature::CalculateFrequencies(CMiniLexicon* Lexicon)
667 CStringSurrogate Suffix;
668 CSuffix* pSuffix;
669 CStem* pStem;
670 CCorpusWord* pCorpusWord;
671 Q_ASSERT( GetCorpusCount() > 0);
672 int TotalCorpusCount = 0;
673 int* SuffixCount = new int [ Size()+ 1 ];
674 for (int suffixno = 1; suffixno <= Size(); ++suffixno)
675 { SuffixCount[suffixno] = 0; }
677 for (int suffixno = 1; suffixno <= Size(); suffixno++)
679 Suffix = GetPiece(suffixno);
680 pSuffix = new CSuffix(Suffix);
682 for (int stemno= 0; stemno < GetNumberOfStems(); stemno++)
684 pStem = GetStem(stemno);
685 pCorpusWord = Lexicon->FindAWord (pStem, pSuffix);
686 if( pCorpusWord ) // might not exist if we have collapsed signatures.
688 TotalCorpusCount += pCorpusWord->GetCorpusCount();
689 SuffixCount[suffixno] += pCorpusWord->GetCorpusCount();
694 delete [] SuffixCount;
697 // <<-------------------------------------------------------------------------------------------------------->>
698 void CSignature::ListDisplay(Q3ListView* List,
699 QMap<QString, QString>* filter, bool ExpressDeletees)
701 CSignature sig(m_pMyMini);
702 Express(sig, ExpressDeletees);
703 QString text = sig.Display('.', filter);
705 static_cast<void>(new CSignatureListViewItem(
706 List, text, m_pMyMini->GetIndex(), this, filter));
709 // <<-------------------------------------------------------------------------------------------------------->>
710 void CSignature::FindCorpusCount( )
712 SetCorpusCount ( 0 );
713 for (int stemno =0; stemno < GetNumberOfStems(); stemno++) {
714 for (int affixno = 0; affixno < GetNumberOfAffixes(); affixno ++)
715 IncrementCorpusCount ( GetWord(stemno, affixno)->GetCorpusCount() );
718 // <<-------------------------------------------------------------------------------------------------------->>
719 void CSignature::AttachToSuffixSig(CStem* pStem, bool bLookAtPreviousSig) //add to tail of list.
721 int stemno;
722 int numberofaffixes = GetNumberOfAffixes();
723 CStem* pWord;
724 CSignature* pOldSig = pStem->GetSuffixSignature();
725 QString stem = pStem->Display();
727 /* First, remove pStem from any other SuffixSignature it might be linked to.*/
728 if ( pOldSig && pOldSig != this ) {
729 pOldSig->DetachStem( pStem, eDo_Not_Call_Words );
730 pOldSig->RecalculateStemAndWordPointers();
733 stemno = m_StemPtrList->indexOf ( pStem );
734 if( stemno < 0 ) {
735 m_StemPtrList->append( pStem );
736 stemno = GetNumberOfStems()-1;
739 switch( m_AffixLocation){
740 case (WORD_FINAL):
741 case (STEM_FINAL):
742 for (int affixno = 0; affixno < numberofaffixes; affixno++)
744 pWord = GetLexicon()->GetWordFromStemSuffix(pStem, GetSuffix(affixno));
745 if (pWord)
747 AppendWordPointer( pWord);
748 pWord->SetSuffixSignature (this);
750 else
752 AppendWordPointer(NULL);
755 break;
756 case (WORD_INITIAL):
757 case (STEM_INITIAL):
758 for (int prefixno = 0; prefixno < numberofaffixes; prefixno++)
760 pWord = GetLexicon()->GetWordFromStemPrefix(pStem, GetPrefix(prefixno));
761 if (pWord)
763 AppendWordPointer( pWord);
764 pWord->SetPrefixSignature (this);
766 else
768 AppendWordPointer(NULL);
771 break;
774 pStem->SetSuffixSignature( this );
775 IncrementCorpusCount( pStem->GetCorpusCount()-1 );// first time CC is incremented
777 m_Robustness = 0;
778 m_Robustness = GetRobustness();
780 // <<-------------------------------------------------------------------------------------------------------->>
781 void CSignature::AttachToPrefixSig( CStem* pStem, bool bLookAtPreviousSig ) //add to tail of list.
783 CStem* pWord;
784 CSignature* pOldSig = pStem->GetPrefixSignature();
786 /* First, remove pStem from any other PrefixSignature it might be linked to.*/
787 if ( pOldSig && pOldSig != this ) {
788 pOldSig->DetachStem( pStem, eDo_Not_Call_Words );
789 RecalculateStemAndWordPointers();
792 if( m_StemPtrList->indexOf ( pStem ) < 0 ) {
793 AppendStemPtr( pStem );
796 // move the Words from the old signature to this, the new one.
798 for (int wordno = 0; wordno < pStem->GetNumberOfWords(); wordno++) {
799 pWord = pStem->GetWord(wordno);
800 m_WordPtrList->append (pWord);
801 pWord->SetPrefixSignature (this);
806 pStem->SetPrefixSignature( this );
807 IncrementCorpusCount( pStem->GetCorpusCount()-1 );
808 m_Robustness = GetRobustness();
811 // <<-------------------------------------------------------------------------------------------------------->>
812 double CSignature::GetRobustness() const
814 int SuffixLetters = 0,
815 StemLetters = 0;
817 if (m_Robustness == 0)
819 SuffixLetters = GetKeyLength();
820 QString Null = "NULL";
821 if ( Contains( CStringSurrogate(Null.unicode(),0,Null.length()) ) ) { SuffixLetters -= 4; }
823 CStem* pStem;
824 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
825 pStem = GetStem(stemno);
826 StemLetters += pStem->GetKeyLength();
829 m_Robustness = ( Size() - 1 ) * StemLetters + (GetNumberOfStems() - 1) * SuffixLetters;
832 return m_Robustness;
834 // <<-------------------------------------------------------------------------------------------------------->>
835 void CSignature::SetRobustness ( double R ) { m_Robustness = R; }
836 // <<-------------------------------------------------------------------------------------------------------->>
838 // the counts of each individual word analyzed by this signature.
839 //double* CSignature::GetWordCounts() const { return m_WordCounts;
841 // <<-------------------------------------------------------------------------------------------------------->>
842 double CSignature::GetWordCount(int stemno, int affixno) const
844 if ( stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes() ) return 0;
845 return m_WordCounts[stemno * GetNumberOfStems() + affixno];
847 // <<-------------------------------------------------------------------------------------------------------->>
848 void CSignature::SetWordCount (int stemno, int affixno, double value)
850 if ( stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes() )
851 return;
852 m_WordCounts[stemno * GetNumberOfAffixes() + affixno] = value;
853 return;
857 // <<-------------------------------------------------------------------------------------------------------->>
859 void CSignature::CalculateWordCounts()
860 { QString string;
862 int numberofstems = GetNumberOfStems();
863 int numberofaffixes = GetNumberOfAffixes();
864 int count = 0;
866 CStem* pWord;
868 m_WordCounts.clear();
869 m_WordCounts.resize(numberofstems*numberofaffixes);
870 m_StemCounts.clear();
871 m_StemCounts.resize(numberofstems);
872 m_AffixCounts.clear();
873 m_AffixCounts.resize(numberofaffixes);
874 m_TotalCount = 0;
875 for (int affixno = 0; affixno < numberofaffixes; affixno++) { m_AffixCounts[affixno] = 0; }
876 for (int stemno = 0; stemno < numberofstems; stemno++) { m_StemCounts[stemno] = 0; }
880 for (int stemno = 0; stemno < numberofstems; stemno++)
882 for ( int affixno = 0; affixno < numberofaffixes; affixno++)
884 pWord = GetWord(stemno, affixno);
885 count = pWord->GetCorpusCount();
886 // SetWordCount (stemno, affixno, count);
887 // m_StemCounts[stemno] = m_StemCounts[stemno] + count;
888 // m_AffixCounts[affixno] = m_AffixCounts[affixno] + count;
889 // m_TotalCount += count;
893 if (m_TotalCount <= 0) return;
895 m_WordFrequencies.resize(numberofstems*numberofaffixes);
896 m_StemFrequencies.resize(numberofstems);
897 m_AffixFrequencies.resize(numberofaffixes);
899 for ( int stemno = 0; stemno < numberofstems; stemno++)
901 m_StemFrequencies[stemno] = m_StemCounts[stemno]/m_TotalCount;
902 for ( affixno = 0; affixno < numberofaffixes; affixno++)
904 wordno = stemno * numberofaffixes + affixno;
905 m_WordFrequencies[wordno] = GetWordCount(stemno, affixno) / m_TotalCount;
909 for (int affixno = 0; affixno < numberofaffixes; affixno++){
910 m_AffixFrequencies[affixno] = m_AffixCounts[affixno] / m_TotalCount;
916 //=================================================================================================/
918 // TODO: make sure COST function is consistent with older versions and working right
919 double CSignature::FindCost(CMiniLexicon* Lexicon)
921 //=================================================================================================/
925 Cost of a sig =
927 Sum over all of its stems :
929 log ( CorpusSize / Stem-count ) ( cost )
930 length ( stem ) * cost of a letter ( savings )
932 Sum over all of its suffixes:
934 log ( CorpusSize / suffix-count ) ( cost )
935 length ( suffix ) * cost of a letter ( savings )
938 CStem* pStem;
939 double Cost = 0,
940 AffixCost = 0,
941 AffixSavings = 0,
942 SignatureCost = 0,
943 StemCost = 0,
944 StemSavings = 0,
945 CostOfALetter = base2log (26),
946 ThisAffixCost = 0,
947 NumberOfWords = Lexicon->GetWords()->GetCount();
948 CAffix* pAffix;
951 for (int affixno = 1; affixno <= Size(); affixno++)
953 if( m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL )
955 pAffix = *Lexicon->GetSuffixes() ^= GetPiece(affixno);
957 else
959 pAffix = *Lexicon->GetPrefixes() ^= GetPiece(affixno);
962 if ( pAffix ) // it already exists
964 ThisAffixCost = base2log ( NumberOfWords / pAffix->GetUseCount() );
966 else
968 ThisAffixCost = base2log ( NumberOfWords/GetNumberOfStems() );
969 ThisAffixCost += GetPiece(affixno).GetLength() * CostOfALetter;
971 AffixCost += ThisAffixCost;
973 AffixSavings += GetPiece(affixno).GetLength() * CostOfALetter;
975 SignatureCost += ThisAffixCost;
979 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
981 pStem = m_StemPtrList->at(stemno);
982 StemCost += base2log ( NumberOfWords / Size() ); // Size is the number of words that use stem, of course.
983 StemCost += pStem->GetKeyLength() * CostOfALetter;
984 StemSavings += pStem->GetKeyLength() * CostOfALetter * Size(); // save for each time stem appears, with each suffix
985 SignatureCost += StemCost;
988 Cost = AffixCost + StemCost - AffixSavings - StemSavings + SignatureCost;
990 return Cost;
994 // <<-------------------------------------------------------------------------------------------------------->>
996 void CSignature::OutputSignature( QTextStream& outf )
1000 QString string;
1001 CStem* pStem;
1004 outf << " ------------------------------------------------------------------------------------------ " << endl;
1005 outf << Display( '.', m_pMyMini->GetOutFilter() );
1006 outf << endl << " ------------------------------------------------------------------------------------------ " << endl;
1008 outf << endl;
1009 outf << " ";
1013 outf << "Number of stems: ";
1014 outf << QString("%1").arg( (int) GetNumberOfStems() );
1016 outf << " Corpus count: ";
1017 outf << QString("%1").arg( GetCorpusCount() );
1018 outf << " ";
1020 outf << " ";
1021 outf << GetRemark().replace( QChar(' '), "_" );
1022 outf << " ";
1024 outf << "Number of affixes: ";
1025 outf << GetNumberOfAffixes();
1026 outf << " Word Pointer List length: ";
1027 outf << m_WordPtrList->count();
1028 outf << endl;
1031 QStringList stems;
1033 CalculateWordCounts();
1034 int maxlength = 0;
1035 CStem* pWord;
1037 outf.setFieldAlignment( QTextStream::AlignLeft );
1038 QList< QPair<CStem*, int> > pstems;
1039 for (int stemno =0; stemno< GetNumberOfStems(); stemno++ )
1041 pStem = GetStem(stemno);
1042 pstems.append( qMakePair(pStem, pStem->GetCorpusCount() ) );
1043 if (pStem->GetKeyLength() > maxlength) { maxlength = pStem->GetKeyLength();}
1045 qSort(pstems.begin(), pstems.end(), stemlessthan);
1047 outf << "Sorted by stem frequency: " << endl << endl;
1048 outf << "# Rank | Stem | Words .... " << endl;
1049 outf << "# ------------------------------------------------------------------------------------------ " << endl;
1052 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
1054 outf.setf(2);
1055 outf.width(5);
1056 pStem = pstems[stemno].first;
1057 outf. width(6);
1058 outf << stemno;
1059 outf. width( maxlength + 5);
1060 outf << pStem->Display();
1061 outf.width (9);
1062 outf << pstems[stemno].second;
1063 outf << endl;
1065 outf << endl << "# ------------------------------------------------------------------------------------------ " << endl;
1066 outf << endl << endl <<"Display all words with counts: " << endl;
1067 outf << "# ------------------------------------------------------------------------------------------ " << endl;
1069 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
1071 for ( int affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
1073 pWord = GetWord(stemno, affixno );
1074 if (pWord)
1076 outf.setFieldWidth (maxlength + 5); outf << pWord->Display();
1077 outf.setFieldWidth (5) ; outf << string.setNum( pWord->GetCorpusCount() );
1080 outf << endl;
1082 outf << endl << endl;
/* The purpose of this function is to take a signature of the form A.SUFFIX
   and make it NULL.SUFFIX (the pAlternateSig), and move that letter A back onto its stems.
*/
1092 // <<-------------------------------------------------------------------------------------------------------->>
1093 void CSignature::RemoveLetter (CStringSurrogate& ssLetter, CMiniLexicon* Lexicon, CSignature* pAlternateSig)
1096 CStem* qStem;
1097 CSuffix* pSuffix,
1098 *pNewSuffix = NULL;
1099 QString Stem,
1100 Suffix,
1101 Null = "NULL";
1102 QString OldKey = Display();
1103 CStringSurrogate ssSuffix,
1104 ssStem;
1105 CStem* pWord;
1106 CSignature NewSig ( WORD_FINAL, Lexicon );
1107 int LetterLength = ssLetter.GetLength();
1109 CSignature *qSig = NULL,
1110 *pOlderSig = NULL;
1111 CParse PSuffix,
1112 PWord,
1113 PNewStem;
1115 QMap<QString,CSuffix*> SuffixPtrTranslation;
1117 /* Create the NewSig */
1118 for (int affixno = 1; affixno <= Size(); affixno++)
1120 ssSuffix = GetPiece(affixno);
1121 if(!NewSig.GetSortStyle()== eAlphabetized) NewSig.Alphabetize();
1122 if ( ssSuffix == ssLetter )
1124 if(!NewSig.GetSortStyle()==eAlphabetized) NewSig.Alphabetize();
1125 NewSig.Append ( CStringSurrogate(Null.unicode(),0,Null.length()) );
1127 else
1129 QString lt_brak = "<", rt_brak = ">";
1131 PSuffix = CStringSurrogate(lt_brak.unicode(),0,1);
1132 PSuffix += ssLetter;
1133 PSuffix += CStringSurrogate(rt_brak.unicode(),0,1);
1134 PSuffix += ssSuffix;
1136 pSuffix = *Lexicon->GetSuffixes() << PSuffix;
1138 Suffix = "<" + ssLetter.Display() + ">" + ssSuffix.Display();
1139 SuffixPtrTranslation[ ssSuffix.Display() ] = pSuffix; // based on old suffix
1140 // SuffixStringTranslation[ ssSuffix.Display() ] = Suffix;
1142 NewSig.Append ( PSuffix.GetKey() );
1146 /* Change the KEY of this signature */
1148 SetKey ( NewSig );
1149 QString remark = GetRemark() + " +allomorphy";
1150 SetRemark ( remark );
1152 //-----------------------------------------------------------//
1153 // Change the signature, the stems, the words -- and the suffixes.
1154 //-----------------------------------//
1155 /* Deal with the stems */
1156 //-----------------------------------//
1159 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
1161 CStem* pStem = m_StemPtrList->at(stemno);
1162 ssStem = pStem->GetKey();
1163 PNewStem = ssStem + ssLetter;
1164 qStem = *Lexicon->GetStems() ^= PNewStem;
1166 if (qStem) // -- if the larger one already existed
1168 pOlderSig = *Lexicon->GetSignatures() ^= qStem->GetSuffixList();
1170 // this removes both stem and word from signature:
1171 pOlderSig -> DetachStem ( qStem, eCall_Words ); // we might want to eliminate this sig if it has no more stems
1173 qStem -> GetSuffixList()->MergeAndAlphabetizeParse( CParse(NewSig) );
1175 qSig = *Lexicon->GetSignatures() << qStem->GetSuffixList();
1177 // attaches both stems and words to qSig
1178 qSig -> AttachToSuffixSig(qStem, false);
1181 else // make the old stem into this new one
1183 pStem -> RepairSuffixList ( Lexicon );
1184 Lexicon -> GetStems()-> SetKey( pStem, PNewStem );
1185 pStem -> SetKey( PNewStem );
1189 Q_ASSERT(m_StemPtrList->size() > 0);
1190 CStem* pStem = m_StemPtrList->at(m_StemPtrList->size() - 1);
1192 //---------------------------------------------//
1193 /* Deal with the WORDs of this signature */
1194 //---------------------------------------------//
1196 for (int wordno = 0; wordno < m_WordPtrList->size(); wordno++)
1198 pWord = m_WordPtrList->at(wordno);
1199 pNewSuffix = SuffixPtrTranslation[ pWord->GetSuffix().Display() ];
1200 pWord -> ShiftStemSuffixBoundary ( LetterLength );
1202 pWord -> SetSuffixPtr ( pNewSuffix );
1203 pWord -> AttachWordAndSuffixalStem ( pStem );
1204 pWord -> SetSuffixSignature ( this );
1208 //------------------------------------------------------------//
1209 // Alternate Sig
1210 //------------------------------------------------------------//
1211 /* Shift stems from AlternateSig to the NewSig, but NOT
1212 if the stem ends with Letter; if it does, we'll
1213 keep the old signature with that stem.
1216 This will replace some or all of pAlternateSig --
1217 "some" when there are any stems that don't allow removal of the Letter.
1218 For example, NULL.ing will not disappear when <e>ing.NULL is created,
1219 because the stem "be" still requires NULL.ing --
1222 // Deal with stems in AlternateSig....
1224 for (int stemno = 0; stemno < pAlternateSig->GetNumberOfStems(); stemno++)
1226 pStem = pAlternateSig->GetStem(stemno);
1227 ssStem = pStem->GetKey();
1228 if ( ssStem.Right(LetterLength) == ssLetter )
1229 { continue; }
1231 pAlternateSig->DetachStem( pStem, eCall_Words );
1232 AttachToSuffixSig( pStem, false );
1234 // Deal with Words in Alternate signature
1236 for (int stemno = 0; stemno < pAlternateSig->GetNumberOfStems(); stemno++)
1238 pWord = pAlternateSig->GetStem(stemno);
1239 pNewSuffix = SuffixPtrTranslation[ pWord->GetSuffix().Display() ];
1241 pWord -> SetSuffixPtr ( pNewSuffix );
1242 pWord -> AttachWordAndSuffixalStem ( pStem );
1243 pWord ->SetSuffixSignature ( this );
1246 //------------------------------------------------------------//
1248 /* Get rid of the Alternate Sig ("NULL.ing" ) */
1250 if ( pAlternateSig->GetNumberOfStems() == 0 )
1252 Lexicon->GetSignatures()->Remove( pAlternateSig );
1257 // <<------------------------------------------------------------------------>>
1258 bool CSignature::EachSuffixCanHaveThisLetterPrefixedToIt ( const QString& Letter)
1260 QString Suffix;
1261 for (int affixno = 1; affixno <= Size(); ++affixno) {
1262 Suffix = GetPiece(affixno).Display();
1263 if (Suffix == "NULL" ) { Suffix = ""; }
1264 Suffix = Letter + Suffix;
1265 if(0)// TODO: if ( ! (*Lexicon->GetSuffixes() ^= Suffix ) )
1267 return FALSE;
1271 return TRUE;
1273 // <<------------------------------------------------------------------------>>
1274 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance,
1275 const QString& Piece)
1277 struct not_implemented { };
1278 throw not_implemented();
1280 // XXX. suppresses “unused parameter” warnings
1281 static_cast<void>(Distance);
1282 static_cast<void>(Piece);
1284 foreach (CStem* word, *m_WordPtrList) {
1285 word->ShiftStemSuffixBoundary(-1);
1286 Q_ASSERT(word->GetStemLoc() != 0);
1289 foreach (CStem* stem, *m_StemPtrList) {
1290 CStringSurrogate stem_text = stem->GetKey();
1291 stem->ClearParse();
1292 stem->SetKey(stem_text.Left(stem_text.GetLength() - 1));
1294 // XXX. Check to see if the new stem already exists.
1295 // Lexicon->GetStems()->GetHash()->RemoveKey ( Stem );
1296 // Lexicon->GetStems()->GetHash()->SetAt( NewStem, pStem );
1297 // Lexicon->GetStems()->SetKey( pStem, NewStem );
1300 // XXX. fix the signature
1301 // AddLetter ( 1, Piece );
1303 // Lexicon->AddToScreen ( Display() );
1306 // Variant in which the shifted string varies from stem to stem.
1307 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance)
1309 // XXX. suppresses “unused parameter” warning
1310 static_cast<void>(Distance);
1311 struct not_implemented { };
1312 throw not_implemented();
1314 // first, fix the words;
1315 foreach (CStem* word, *m_WordPtrList) {
1316 word->ShiftStemSuffixBoundary(-1);
1317 Q_ASSERT(word->GetStemLoc() != 0);
1320 // XXX. fix the signature
1321 // AddLetter ( 1, Piece );
1323 // Lexicon->AddToScreen ( Display() );
1326 void CSignature::AddLetter(const QString& Letter )
1328 PrefixToAllPieces ( CStringSurrogate(Letter.unicode(),0,Letter.length() ) );
1332 /// Looks at the final ngrams of the stems, and calculates its entropy
1333 double CSignature::ComputeFinalNgramEntropyOfStems(int n)
1335 TCollection<CLParse> Ngrams;
1336 foreach (CStem* pStem, *m_StemPtrList) {
1337 if (pStem->GetKeyLength() <= n)
1338 // too short
1339 return -1;
1341 CStringSurrogate ssPiece = pStem->GetKey();
1342 ssPiece = is_initial(GetAffixLocation()) ?
1343 ssPiece.Left(n) : ssPiece.Right(n);
1344 Ngrams << ssPiece;
1347 double Entropy = 0.0;
1348 const double StemCount = GetNumberOfStems();
1349 const int ngram_count = Ngrams.GetCount();
1350 for (int i = 0; i < ngram_count; ++i) {
1351 const double fraction = StemCount / Ngrams[i]->GetCorpusCount();
1352 Entropy += log2(fraction) / fraction;
1355 return Entropy;
1357 //===================================================================================================//
1359 // CHECK OUT: major function
1361 //===================================================================================================//
1362 /// Test to see whether the break with its stems is a good one.
1363 int CSignature::CheckOut(CMiniLexicon* Lexicon)
1365 using linguistica::implicit_cast;
1366 // Throughout, “DL” stands for “description length”.
1367 Lexicon->LogFileSmallTitle("Empirical signature: "+ Display() );
1368 if (Lexicon->LogFileOn()) {
1369 // dump stem list to log file.
1370 Lexicon->LogFileStartTable();
1371 Lexicon->LogFileStartRow();
1372 const int num_columns = 5;
1374 // For each stem:
1375 CParse Stems = GetStems();
1376 for (int stemno = 1; stemno <= GetNumberOfStems(); ++stemno) {
1377 if (stemno % num_columns == 0) {
1378 Lexicon->LogFileEndRow(); Lexicon->LogFileStartRow();
1380 Lexicon->LogFileSimpleString( Stems[stemno].Display()); //JG June 2010
1382 Lexicon->LogFileEndRow(); Lexicon->LogFileEndTable();
1383 } // end of logfile on
1384 Lexicon->LogFileHeader("Number of letters","Entropy", "Resolution?" );
1385 bool LowEntropyFlag = false;
1386 int LargestSizeChunkToPullOffStem = 0;
1387 // Use entropy to see how many letters to consider shifting
1388 // XXX. Make this user-changeable.
1389 const double EntropyThreshold = 1.5;
1390 const int LengthToConsiderShifting = 4;
1391 for (int n = 1; n <= LengthToConsiderShifting; ++n) {
1392 const double Entropy = ComputeFinalNgramEntropyOfStems(n);
1394 if (Entropy < 0) {
1395 // Negative entropy:
1396 // stem too short to consider shortening.
1397 Lexicon->LogFile("", "", "No reanalysis");
1398 continue;
1401 if (Entropy >= EntropyThreshold) {
1402 Lexicon->LogFileSimpleString ("");
1403 Lexicon->LogFileSimpleDouble(Entropy);
1404 Lexicon->LogFileSimpleString("Entropy too large.");
1405 break;
1408 // set of n-suffixes of stems has low entropy:
1409 // maybe stems have a common suffix that should be
1410 // incorporated into the signature.
1411 LowEntropyFlag = true;
1412 LargestSizeChunkToPullOffStem = n;
1413 Lexicon->LogFile(n, Entropy, "Entropy sufficiently small.");
1414 } //end of loop on n
1415 Lexicon->LogFileEndTable();
1416 if (!LowEntropyFlag)
1417 // Not enough stems share common endings to restructure,
1418 // so leave this signature alone.
1419 return 0;
1421 const bool analyzingSuffixes = !is_initial(GetAffixLocation());
1423 const double TotalNumberOfAnalyzedWords =
1424 Lexicon->GetSignatures()->GetTotalNumberOfWords();
1425 const double LogTotalNumberOfAnalyzedWords =
1426 base2log(TotalNumberOfAnalyzedWords);
1427 const double LengthOfPointerToThisSig =
1428 LogTotalNumberOfAnalyzedWords -
1429 base2log(Size() * GetNumberOfStems());
1431 // Description length of the original analysis
1432 double CurrentDL;
1434 // DL of this signature:
1436 // a. Length of pointers to its suffixes; var: LengthOfPointersToAllAffixesOfSig
1437 // b. Prorated responsibility for phonological content of suffixes
1438 // var: TotalResponsibilityForAffixListings
1439 // c. List of pointers from each stem to this signature
1440 // var: StemPointersToThisSig;
1441 // d. List of pointers from each word to its suffix
1443 // Compute DL of 'original' analysis.
1444 Lexicon->LogFileSmallTitle ("Description length of current signature");
1445 Lexicon->LogFileHeader("Affix", "Use count", "Pointer to this affix"); ;
1447 double LengthOfPointersToAllAffixesOfSig = 0.0;
1448 double TotalResponsibilityForAffixListings = 0.0;
1449 // for each suffix (resp. prefix) in this signature:
1450 for (int affixno = 1; affixno <= Size(); ++affixno) {
1451 QString Affix = GetPiece(affixno).Display();
1452 CAffix* pAffix = analyzingSuffixes
1453 ? implicit_cast<CAffix*>(
1454 *Lexicon->GetSuffixes() ^= Affix)
1455 : implicit_cast<CAffix*>(
1456 *Lexicon->GetPrefixes() ^= Affix);
1458 // Length of pointers to affixes
1459 // part a
1460 const double LengthOfPointerToThisAffix =
1461 LogTotalNumberOfAnalyzedWords -
1462 base2log(pAffix->GetUseCount());
1463 LengthOfPointersToAllAffixesOfSig +=
1464 LengthOfPointerToThisAffix;
1466 Lexicon->LogFile(Affix, pAffix->GetUseCount(), LengthOfPointerToThisAffix);
1468 // use count of affix; length of pointer to this affix.
1469 // Assign partial responsibility for this signature's
1470 // suffixes' entries.
1471 // part b.
1472 const double LocalProportion =
1473 double(GetNumberOfStems()) / pAffix->GetUseCount();
1474 const double ResponsibilityForThisAffixListing =
1475 LocalProportion * Affix.length() * base2log(26);
1476 TotalResponsibilityForAffixListings +=
1477 ResponsibilityForThisAffixListing; // in *bits*
1478 }// end of affixno loop
1480 Lexicon->LogFileEndTable();
1481 Lexicon->LogFileStartTable();
1482 Lexicon->LogFile("Part 1: Length of pointer to affixes", LengthOfPointersToAllAffixesOfSig);
1483 Lexicon->LogFile("Part 2: Prorated responsibility for phonology of affixes:", TotalResponsibilityForAffixListings);
1485 // part c.
1486 const double StemPointersToThisSig =
1487 GetNumberOfStems() * LengthOfPointerToThisSig;
1489 // In sum:
1490 const double total_dl =
1491 LengthOfPointersToAllAffixesOfSig +
1492 TotalResponsibilityForAffixListings +
1493 StemPointersToThisSig;
1494 Lexicon->LogFile("Part 3: Stem poionters to this sig:", StemPointersToThisSig);
1495 Lexicon->LogFile("Length of 1 pointer to this sig: ", LengthOfPointerToThisSig);
1496 Lexicon->LogFile("Total", total_dl);
1497 Lexicon->LogFileEndTable();
1498 CurrentDL = total_dl;
1500 double WinningDL = CurrentDL;
1501 int WinningLengthOfStemToShift = 0;
1503 // We might shift only those stems for which the EndPiece
1504 // occurs in more than 45% of the stems of this sig (that
1505 // leaves open the case of two closely related letters
1506 // comprising almost all of the cases).
1507 // But for now, we're not doing that.
1509 // The outer loop here is for the case where the entropy test
1510 // tells us that 2 or more letters can be shifted
1511 // (e.g., sig on.ve can be shifted either to ion.ive or
1512 // tion.tive), and we want to evaluate both.
1514 // Major loop through alternatives to the current signature
1515 CParse WinningSig;
1516 // loop through different lengths to shift:
1517 for (int NumberOfLettersShifted = LargestSizeChunkToPullOffStem;
1518 NumberOfLettersShifted > 0;
1519 --NumberOfLettersShifted) {
1521 TCollection<CLParse> EndPieces;
1522 foreach (CStem* pStem, *m_StemPtrList) {
1523 if (pStem->GetKeyLength() <= NumberOfLettersShifted)
1524 continue;
1526 CStringSurrogate stem_text = pStem->GetKey();
1527 CStringSurrogate ssPiece = analyzingSuffixes
1528 ? stem_text.Right(NumberOfLettersShifted)
1529 : stem_text.Left(NumberOfLettersShifted);
1530 EndPieces << ssPiece;
1533 // XXX. The function is supple enough to move material
1534 // from the stem to the affix in some cases but not in others.
1536 double AllNewSigsAnalysisDL = 0.0;
1537 double TotalDecreaseInDLDueToShorterStems = 0.0;
1538 // each of these is a distinct piece being, perhaps,
1539 // transferred from stem(s) to affixes
1540 // for each string of this length that would have to be shifted:
1541 CParse Sig;
1542 for (int pieceno = 0; pieceno < EndPieces.GetCount(); ++pieceno) {
1543 CLParse* pPiece = EndPieces.GetAt(pieceno);
1545 // make a copy to play with.
1546 Sig = *this;
1548 if (analyzingSuffixes)
1549 Sig.PrefixToAllPieces2(pPiece->GetKey());
1550 else
1551 Sig.SuffixToAllPieces2(pPiece->GetKey());
1553 // DL of this signature:
1555 // a. Length of pointers to its suffixes;
1556 // var: LengthOfPointersToAllAffixesOfSig
1557 // b. Prorated responsibility for phonological
1558 // content of suffixes
1559 // var: TotalResponsibilityForAffixListings
1560 // c. List of pointers from each stem to this
1561 // signature
1562 // var: PointersToThisSig;
1563 // d. Savings because stems already existed
1564 // var: SavingsBecauseStemAlreadyExisted
1565 // e. Savings because stems are shorter
1566 // var: TotalDecreaseInDLDueToShorterStems :
1567 // once for each *length* being shifted from
1568 // stem to suffix
1569 // f. List of pointers from each word to its
1570 // suffix
1571 // XXX. not implemented.
1573 double LengthOfPointersToAllAffixesOfSig = 0.0;
1574 double TotalResponsibilityForAffixListings = 0.0;
1575 if (*Lexicon->GetSignatures() ^= Sig) {
1576 // new signature already exists
1577 Lexicon->LogFileSmallTitle("Alternative analysis already existed", Sig.Display('-'));
1578 // XXX. address this case!
1579 } else {
1580 // new signature
1581 Lexicon->LogFileSmallTitle("Conjectured signature: " + Sig.Display('-'));
1582 Lexicon->LogFileStartTable();
1583 // iterate through suffixes of the signature
1584 Lexicon->LogFileHeader("Suffix", "Previous count", "New count", "Pointer length to this affix", "Responsibility for this affix (phonology) in bits:", "New DL for this affix");
1585 double ThisNewSigDL = 0.0;
1586 // for each suffix (resp prefix) in the new sig:
1587 for (int affixno = 1; affixno <= Size(); ++affixno) {
1588 CStringSurrogate ssAffix =
1589 Sig.GetPiece(affixno);
1591 CAffix* pAffix = analyzingSuffixes
1592 ? implicit_cast<CAffix*>(
1593 *Lexicon->GetSuffixes() ^= ssAffix)
1594 : implicit_cast<CAffix*>(
1595 *Lexicon->GetPrefixes() ^= ssAffix);
1596 double sum;
1597 if (pAffix != 0) {
1598 const double ResponsibilityForThisAffixListing =
1599 double(ssAffix.GetLength()) * base2log(26) *
1600 GetNumberOfStems() /
1601 (double(GetNumberOfStems()) +
1602 pAffix->GetUseCount());
1603 const double LengthOfPointerToThisAffix =
1604 LogTotalNumberOfAnalyzedWords -
1605 base2log(pAffix->GetUseCount() +
1606 GetNumberOfStems());
1608 TotalResponsibilityForAffixListings +=
1609 ResponsibilityForThisAffixListing;
1610 LengthOfPointersToAllAffixesOfSig +=
1611 LengthOfPointerToThisAffix;
1613 sum = ResponsibilityForThisAffixListing +
1614 LengthOfPointerToThisAffix;
1615 Lexicon->LogFile (ssAffix.Display(), pAffix->GetUseCount(), GetNumberOfStems() + pAffix->GetUseCount(), LengthOfPointerToThisAffix, ResponsibilityForThisAffixListing, sum);
1617 } else {
1618 // new affix
1619 const double ResponsibilityForThisAffixListing =
1620 double(ssAffix.GetLength()) * base2log(26);
1621 const double LengthOfPointerToThisAffix =
1622 LogTotalNumberOfAnalyzedWords -
1623 base2log(GetNumberOfStems());
1625 LengthOfPointersToAllAffixesOfSig +=
1626 LengthOfPointerToThisAffix;
1627 TotalResponsibilityForAffixListings +=
1628 ResponsibilityForThisAffixListing;
1629 sum = ResponsibilityForThisAffixListing +
1630 LengthOfPointerToThisAffix;
1631 Lexicon->LogFile(ssAffix.Display(), 0, GetNumberOfStems(), LengthOfPointerToThisAffix, ResponsibilityForThisAffixListing, sum);
1632 } //end of else
1633 ThisNewSigDL += sum;
1635 Lexicon->LogFile("Total", 0, 0, LengthOfPointersToAllAffixesOfSig, TotalResponsibilityForAffixListings, ThisNewSigDL);
1636 Lexicon->LogFileEndTable();
1639 // Length of the pointers to the sig from its stems:
1640 double SavingsBecauseStemAlreadyExisted = 0.0;
1641 double StemPointersToThisSig;
1642 IterateThroughStems(NumberOfLettersShifted,
1643 Lexicon,
1644 pPiece,
1645 TotalDecreaseInDLDueToShorterStems,
1646 LogTotalNumberOfAnalyzedWords,
1647 StemPointersToThisSig,
1648 SavingsBecauseStemAlreadyExisted,
1649 analyzingSuffixes);
1650 const double ThisNewSigDL =
1651 LengthOfPointersToAllAffixesOfSig +
1652 TotalResponsibilityForAffixListings +
1653 StemPointersToThisSig +
1654 -SavingsBecauseStemAlreadyExisted +
1655 -TotalDecreaseInDLDueToShorterStems;
1656 AllNewSigsAnalysisDL += ThisNewSigDL;
1657 Lexicon->LogFile("Part 1: Length of pointer to affixes: ", LengthOfPointersToAllAffixesOfSig);
1658 Lexicon->LogFile("Part 2: Prorated responsibility for phonology of affixes: ", TotalResponsibilityForAffixListings);
1659 Lexicon->LogFile("Part 3: Stem pointers to this sig:", StemPointersToThisSig);
1660 Lexicon->LogFile("Length of 1 poitner to this sig: ", LengthOfPointerToThisSig);
1661 Lexicon->LogFile("Part 4: Total savings from stems that had already existed", SavingsBecauseStemAlreadyExisted);
1662 Lexicon->LogFile("Part 5: Total decrease in DL due to shorter stems: ", TotalDecreaseInDLDueToShorterStems);
1663 Lexicon->LogFile("Total DL: ", ThisNewSigDL);
1665 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1666 "<br /><br />" <<
1667 QString("If we add %1 letters, total TD is %2")
1668 .arg(NumberOfLettersShifted).arg(AllNewSigsAnalysisDL) <<
1669 endl << "******" << endl <<
1670 "<br />";
1672 if (AllNewSigsAnalysisDL < WinningDL) {
1673 WinningDL = AllNewSigsAnalysisDL;
1674 WinningLengthOfStemToShift = NumberOfLettersShifted;
1675 WinningSig = Sig;
1679 if (WinningDL != CurrentDL) {
1680 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1681 SmallTitle(QString(
1682 "Change signature from \"%1\" to \"%2\"")
1683 .arg(Display(), WinningSig.Display('.'))) <<
1684 "<hr />";
1685 Lexicon->AddToScreen(
1686 QString("%1 >> %2")
1687 .arg(Display('.'), WinningSig.Display('.')));
1688 return WinningLengthOfStemToShift;
1689 } else {
1690 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1691 SmallTitle(QString(
1692 "%1: Conclusion: Keep original signature.")
1693 .arg(Display())) <<
1694 "<hr />";
1695 return 0;
1698 // <<-------------------------------------------------------------------------------------------------------->>
1699 void CSignature::IterateThroughStems( int NumberOfLettersShifted,
1700 CMiniLexicon* Lexicon,
1701 CLParse* pPiece,
1702 double& TotalDecreaseInDLDueToShorterStems,
1703 double LogTotalNumberOfAnalyzedWords,
1704 double& StemPointersToThisSig,
1705 double& SavingsBecauseStemAlreadyExisted,
1706 bool analyzingSuffixes)
1711 CStem* pStem;
1712 int HowManyStemsForThisSig = 0; //check that
1713 int NumberOfShortenedStemsThatPreExisted = 0;
1714 double ThisSavingBecauseStemAlreadyExisted = 0;
1715 double DecreaseInDLDueToShorterStems = 0;
1716 double LengthOfPointerToThisSig = 0;
1717 CSS ssNewStem;
1719 TotalDecreaseInDLDueToShorterStems = 0;
1720 SavingsBecauseStemAlreadyExisted = 0;
1722 Lexicon->LogFile (pPiece->Display() );
1723 Lexicon->LogFileHeader( "Current stem", "Proposed stem", "Savings from preexisting stem");
1726 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
1728 pStem = m_StemPtrList->at(stemno);
1729 ThisSavingBecauseStemAlreadyExisted =0;
1730 int StemLength = pStem->GetKeyLength();
1731 ssNewStem = pStem->GetKey().Left(
1732 StemLength - NumberOfLettersShifted);
1733 Lexicon->LogFileStartRow();
1734 if ( analyzingSuffixes ) // Suffixes
1736 if ( pStem->GetKey().Right(NumberOfLettersShifted).Display() == pPiece->Display() )
1738 HowManyStemsForThisSig++;
1739 Lexicon->LogFile1SimpleString(pStem->Display());
1740 Lexicon->LogFile1SimpleString(ssNewStem.Display());
1743 else
1745 Lexicon->LogFile1SimpleString(pStem->Display());
1746 Lexicon->LogFile1SimpleString(ssNewStem.Display());
1747 continue;
1749 ssNewStem = pStem->GetKey().Left( pStem->GetKeyLength() - NumberOfLettersShifted );
1751 else // Prefixes
1753 if ( pStem->GetKey().Left(NumberOfLettersShifted).Display() == pPiece->Display() )
1755 HowManyStemsForThisSig++;
1757 else
1759 continue;
1761 ssNewStem = pStem->GetKey().Right( pStem->GetKeyLength() - NumberOfLettersShifted );
1766 if ( Lexicon->GetStems()->Contains( ssNewStem ) || // ** Was: "GetStems_Suffixed
1767 Lexicon->GetWords()->Contains( ssNewStem ) )
1769 NumberOfShortenedStemsThatPreExisted ++;
1770 ThisSavingBecauseStemAlreadyExisted = ssNewStem.GetLength() * base2log (26);
1771 SavingsBecauseStemAlreadyExisted += ThisSavingBecauseStemAlreadyExisted;
1773 // ** Add the cost of having a pointer to the stem ******
1777 if ( Lexicon->LogFileOn() &&
1778 ( pStem->GetKey().Right(NumberOfLettersShifted).Display() == pPiece->Display() ) )
1781 if ( ThisSavingBecauseStemAlreadyExisted > 0)
1783 Lexicon->LogFileSimpleString("ThisSavingBecauseStemAlreadyExisted");
1784 } else
1786 Lexicon->LogFileSimpleString("none (did not exist)");
1789 Lexicon->LogFileEndRow();
1792 DecreaseInDLDueToShorterStems = ( HowManyStemsForThisSig - NumberOfShortenedStemsThatPreExisted ) *
1793 NumberOfLettersShifted * base2log (26);
1794 TotalDecreaseInDLDueToShorterStems += DecreaseInDLDueToShorterStems ;
1797 LengthOfPointerToThisSig = LogTotalNumberOfAnalyzedWords - base2log ( Size() * HowManyStemsForThisSig ) ;
1798 StemPointersToThisSig = HowManyStemsForThisSig * ( LengthOfPointerToThisSig ) ;
1799 if ( Lexicon-> LogFileOn() )
1801 *Lexicon->GetLogFile() << // FILL THIS IN --
1803 StartTable <<
1804 StartTableRow <<
1805 MakeTableHeader("Current stem") <<
1806 MakeTableHeader("Proposed stem") <<
1807 MakeTableHeader("Savings from preexisting stem") <<
1808 EndTableRow;
1814 bool CSignature::IsValid()
1815 // tests that pieces of the signature are all non-null
1816 { for (int affixno = 1; affixno <= m_PieceCount; affixno++) {
1817 if ( GetPiece(affixno).GetLength() < 1 ) {
1818 return FALSE;
1821 return TRUE;
1823 // <<-------------------------------------------------------------------------------------------------------->>
1824 void CSignature::DetachStem(CStem* pStem, detachment_parameter Parameter)
1826 if( !m_StemPtrList->isEmpty() &&
1827 m_StemPtrList->indexOf( pStem ) >= 0 &&
1828 m_StemPtrList->remove( pStem ) )
1830 IncrementCorpusCount( -1 * pStem->GetCorpusCount() );
1832 if (Parameter != eDo_Not_Call_Words) {
1833 CStem *pWord;
1834 for (int wordno = 0; wordno < pStem->GetNumberOfWords(); wordno++) {
1835 pWord = pStem->GetWordPtrList()->at(wordno);
1836 const int index = m_WordPtrList->indexOf(pWord);
1837 if (index >= 0)
1838 m_WordPtrList->removeAt(index);
1843 // <<-------------------------------------------------------------------------------------------------------->>
1844 void CSignature::DetachWord(CStem* pWord, enum detachment_parameter param)
1846 struct not_implemented { };
1847 throw not_implemented();
1849 // Suppress warnings.
1850 static_cast<void>(pWord);
1851 static_cast<void>(param);
1853 // <<-------------------------------------------------------------------------------------------------------->>
1854 void CSignature::TakeAllStems(CSignature* source)
1856 //QList<CStem*>& source_stems = *source->GetStemPtrList();
1857 CStem* pStem;
1858 for (int stemno = 0; stemno < source->GetNumberOfStems(); stemno++)
1860 pStem=source->GetStem(stemno);;
1861 pStem->SetSuffixList(this);
1862 AppendStemPtr(pStem);
1863 IncrementCorpusCount(pStem->GetCorpusCount());
1865 // Remove items from source.
1866 //Q_ASSERT(!source_stems.autoDelete());
1867 //source_stems.clear();
1868 source->ClearStemPtrList();
1870 // XXX. Decrement source corpus count in turn?
1871 // Hard to tell, since there are no call sites.
1873 // <<-------------------------------------------------------------------------------------------------------->>
1874 void CSignature::AddWord (CStem* pWord)
1876 m_WordPtrList->append (pWord);
1877 IncrementCorpusCount (pWord->GetCorpusCount() );
1880 void CSignature::ClearStemPtrList() { m_StemPtrList->clear(); }
1881 void CSignature::AppendWordPointer(CStem* pWord ) { m_WordPtrList->append(pWord); }
1882 void CSignature::AppendPrefixPtr(CPrefix* pPrefix) { m_PrefixPtrList->append (pPrefix);}
1883 int CSignature::GetNumberOfWords() const
1885 return m_WordPtrList->count();
1888 // <<-------------------------------------------------------------------------------------------------------->>
1889 CParse CSignature::CreateADeletingSignature( CParse& Deletee, CMiniLexicon* Lexicon )
1891 CStringSurrogate ssSuffix;
1894 CParse PSuffix,
1895 NewSig,
1896 Suffix;
1897 CSuffix* pSuffix;
1898 QString Null = "NULL", lt_brak = "<", rt_brak = "<";
1901 Q_ASSERT (Deletee.Size() == 1);
1903 for (int affixno = 1; affixno <= Size(); affixno++)
1905 ssSuffix = GetPiece(affixno);
1906 if(NewSig.GetSortStyle() != eAlphabetized ) NewSig.Alphabetize();
1907 if ( ssSuffix == Deletee )
1909 NewSig.Append ( CStringSurrogate(Null.unicode(),0,Null.length() ) );
1911 else
1913 PSuffix = CStringSurrogate(lt_brak.unicode(),0,1);
1914 PSuffix += Deletee;
1915 PSuffix += CStringSurrogate(rt_brak.unicode(),0,1);
1916 PSuffix.ClearParseStructure();
1917 PSuffix += ssSuffix;
1918 NewSig.Append ( PSuffix.GetKey() );
1920 pSuffix = *Lexicon->GetSuffixes() << PSuffix;
1922 QString line = "<" + Deletee.Display() + ">" + ssSuffix.Display();
1923 Suffix = CStringSurrogate( line.unicode(),0,line.length());
1925 NewSig.Append (Suffix.GetKey());
1926 // Lexicon->SetSuffixTranslation(this, ssSuffix, Suffix);
1930 return NewSig;
1933 // <<-------------------------------------------------------------------------------------------------------->>
1934 bool CSignature::RemoveStem(CStem * pStem )
1936 return m_StemPtrList->remove( pStem );
1938 // <<-------------------------------------------------------------------------------------------------------->>
1941 bool CSignature::RemoveWord(CStem* pWord)
1943 return m_WordPtrList->remove( pWord );
1945 // <<-------------------------------------------------------------------------------------------------------->>
1946 // copy out affixes, with null affix replaced with "NULL",
1947 // possibly with deletees marked with angle brackets
1948 CSignature& CSignature::Express(CSignature& Output, bool bDisplayDeletees)
1950 CSuffixCollection* Suffixes = 0;
1951 CPrefixCollection* Prefixes = 0;
1952 if (!is_initial(GetAffixLocation()))
1953 Suffixes = GetSignatureCollection()->GetMySuffixes();
1954 else
1955 Prefixes = GetSignatureCollection()->GetMyPrefixes();
1957 Output.ClearParse();
1959 for (int affixno = 1; affixno <= Size(); ++affixno) {
1960 CStringSurrogate affix_text = GetPiece(affixno);
1962 if (affix_text.IsNULL()) {
1963 Output.Append(TheStringNULL);
1964 continue;
1966 if (!is_initial(m_AffixLocation)) {
1967 CSuffix* suffix = *Suffixes ^= affix_text;
1968 Q_ASSERT(suffix != 0);
1970 CParse Temp;
1971 Output.Append(
1972 suffix->Express(Temp, bDisplayDeletees));
1973 } else {
1974 CPrefix* prefix = *Prefixes ^= affix_text;
1975 Q_ASSERT(prefix != 0);
1977 CParse Temp;
1978 Output.Append(
1979 prefix->Express(Temp, bDisplayDeletees));
1982 return Output;
1984 // <<-------------------------------------------------------------------------------------------------------->>
1985 /// concatenate affixes, separated by -.
1986 QString CSignature::Express(bool bDisplayDeletees)
1988 CSuffixCollection* Suffixes = 0;
1989 CPrefixCollection* Prefixes = 0;
1990 if (!is_initial(GetAffixLocation()))
1991 Suffixes = GetSignatureCollection()->GetMySuffixes();
1992 else
1993 Prefixes = GetSignatureCollection()->GetMyPrefixes();
1995 QString Outstring;
1996 for (int affixno = 1; affixno <= Size(); ++affixno) {
1997 CStringSurrogate affix_text = GetPiece(affixno);
1999 if (affix_text.IsNULL()) {
2000 if (!Outstring.isEmpty())
2001 Outstring.append('-');
2002 Outstring.append(TheStringNULL);
2003 continue;
2006 if (is_initial(m_AffixLocation)) {
2007 CPrefix* prefix = *Prefixes ^= affix_text;
2008 Q_ASSERT(prefix != 0);
2009 if (!Outstring.isEmpty())
2010 Outstring.append('-');
2012 CParse Temp;
2013 Outstring.append(prefix->Express(Temp,
2014 bDisplayDeletees).Display());
2015 } else {
2016 CSuffix* suffix = *Suffixes ^= affix_text;
2017 Q_ASSERT(suffix != 0);
2018 if (!Outstring.isEmpty())
2019 Outstring.append('-');
2021 CParse Temp;
2022 Outstring.append(suffix->Express(Temp,
2023 bDisplayDeletees).Display());
2027 return Outstring;
2029 // <<-------------------------------------------------------------------------------------------------------->>
2031 // this should probably be replaced by ComputeDLofModel
2033 double CSignature::ComputeDL( int char_count )
2035 CStem* pStem;
2036 CAffix* pAffix;
2038 CStringSurrogate Affix;
2040 bool CORPUS_BASED_AFFIX_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedAffixCount", 0 );
2041 bool CORPUS_BASED_STEM_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedStemCount", 1 );
2043 double stems_dl = 0.0,
2044 affixes_dl = 0.0;
2046 uint stem_total = 0,
2047 affix_total = 0;
2049 if( CORPUS_BASED_STEM_COUNT )
2051 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2053 stems_dl += ( (double) -1 ) * base2log( (double) pStem->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2056 else
2058 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2060 stems_dl = ( (double) -1 ) * base2log( (double) pStem->GetWordPtrList()->count() / (double) m_pMyMini->GetWords()->GetCount() );
2064 bool analyzedSuffixes = TRUE;
2065 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2067 int i;
2068 if( !CORPUS_BASED_AFFIX_COUNT )
2070 for( i = 1; i <= m_PieceCount; i++ )
2072 Affix = GetPiece(i);
2074 if( analyzedSuffixes )
2076 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2078 else
2080 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2083 if( pAffix ) affix_total += pAffix->GetCorpusCount();
2087 for( i = 1; i <= m_PieceCount; i++ )
2089 Affix = GetPiece(i);
2091 if( analyzedSuffixes )
2093 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2095 else
2097 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2100 if( pAffix )
2102 if( CORPUS_BASED_AFFIX_COUNT ) affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2103 else affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) affix_total );
2107 return stems_dl + affixes_dl;
2110 // <<-------------------------------------------------------------------------------------------------------->>
2111 //====================================================================//
2112 // Description Length //
2113 //====================================================================//
2114 double CSignature::GetDLofMyAffixPointers( )
2116 if (m_DLofMyAffixPointers == 0)
2118 bool analyzedSuffixes = TRUE;
2119 CSuffix * pSuffix;
2120 CPrefix* pPrefix;
2121 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2122 if (analyzedSuffixes)
2124 for (int suffixno = 0; suffixno < GetSuffixPtrList()->size(); suffixno++)
2125 { pSuffix= GetSuffixPtrList()->at(suffixno);
2126 m_DLofMyAffixPointers += pSuffix->GetLengthOfPointerToMe ();
2129 else
2131 for (int prefixno = 0; prefixno < GetPrefixPtrList()->size(); prefixno++)
2133 pPrefix= GetPrefixPtrList()->at(prefixno);
2134 m_DLofMyAffixPointers += pPrefix->GetLengthOfPointerToMe ();
2138 return m_DLofMyAffixPointers;
2140 // <<-------------------------------------------------------------------------------------------------------->>
2141 double CSignature::GetDLofMyStemPointers()
2143 if (m_DLofMyStemPointers == 0)
2145 CStem * pStem;
2146 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
2148 pStem = GetStem(stemno);
2149 m_DLofMyStemPointers += pStem->GetLengthOfPointerToMe ();
2152 return m_DLofMyStemPointers;
2154 // <<-------------------------------------------------------------------------------------------------------->>
2155 double CSignature::ComputeDLofModel(int /* char_count, not used */)
2157 // XXX. take SignatureDL\CorpusBased{Stem,Affix}Count parameters
2158 // into account
2160 m_DLofMyStemPointers = GetDLofMyStemPointers();
2161 m_DLofMyAffixPointers = GetDLofMyAffixPointers();
2162 return m_DLofMyStemPointers + m_DLofMyAffixPointers;
2164 // <<-------------------------------------------------------------------------------------------------------->>
2165 double CSignature::ComputeDLofMyCorpus()
2167 using linguistica::implicit_cast;
2169 if (m_pMyMini == 0)
2170 return 0.0;
2172 m_DLofMyCorpus = 0.0;
2173 foreach (CStem* pWord, *m_WordPtrList) {
2174 CStringSurrogate stem_text = pWord->GetStem();
2175 CStem* stem = *m_pMyMini->GetStems() ^= stem_text;
2177 /***** DEBUG******/
2178 if(stem==NULL)
2180 std::cout << "NULL stem -- in CSignature::ComputeDLofMyCorpus() "<< std::endl;
2181 std::cout << " word: "<<pWord->Display().toStdString()<< std::endl;
2182 std::cout << " stem: "<< stem_text.Display().toStdString()<<std::endl;
2183 CStringSurrogate afx_str
2184 = (is_initial(m_AffixLocation) ? pWord->GetPrefix() : pWord->GetSuffix());
2185 std::cout << " affix:"<< afx_str.Display().toStdString() << std::endl;
2186 std::cout << std::endl;
2187 Q_ASSERT(stem);
2189 /* end DEBUG-s.w.*/
2191 CStringSurrogate affix_text = is_initial(m_AffixLocation)
2192 ? pWord->GetPrefix()
2193 : pWord->GetSuffix();
2194 if (affix_text.GetLength() == 0)
2195 affix_text = TheStringNULL;
2197 CAffix* affix = is_initial(m_AffixLocation)
2198 ? implicit_cast<CAffix*>(
2199 *m_pMyMini->GetPrefixes() ^= affix_text)
2200 : implicit_cast<CAffix*>(
2201 *m_pMyMini->GetSuffixes() ^= affix_text);
2203 CStem* word = *m_pMyMini->GetWords() ^= pWord;
2204 const double ThisWordDL =
2205 stem->GetLengthOfPointerToMe() +
2206 affix->GetLengthOfPointerToMe();
2207 m_DLofMyCorpus += word->GetCorpusCount() * ThisWordDL;
2209 return m_DLofMyCorpus;
2211 // <<-------------------------------------------------------------------------------------------------------->>
2213 namespace {
2214 /// Get the corpus counts of each suffix with this stem
2215 int* GetSuffixCounts(CStem* stem, int* output)
2217 if (output) delete output; // error if this occurs.
2218 output = new int[ stem->GetNumberOfSuffixes() ];
2220 for (int i = 1; i <= stem->GetSuffixList()->Size(); ++i) {
2221 QString Suffix = stem->GetSuffixList()->GetPiece(i).Display();
2222 if (Suffix == "NULL")
2223 Suffix = "";
2224 QString Word = stem->Display() + Suffix;
2225 CStem* pWord = *stem->GetMyMini()->GetWords() ^=
2226 CStringSurrogate(Word);
2228 output[i-1] = pWord->GetCorpusCount();
2230 return output;
2234 //the output is a vector of integers, whose length is
2235 // the number of stems times the number of suffixes. Pass it
2236 // an int pointer that points to NULL; it will delete the memory
2237 // that this function creates.
2238 int* CSignature::GetIndividualCountsForEachStem (int* output )
2240 int affixno, stemno;
2241 int* temp = NULL;
2242 CStem* pStem;
2244 if (output) delete output; //if this occurs, it's an error.
2245 output = new int [GetNumberOfStems() * GetNumberOfAffixes() ];
2247 CMiniLexicon* pMiniLexicon = GetLexicon();
2248 NOT FINISHED YET _--- use GETaWord -- JG
2249 for (stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2251 pSt em = m_StemPtrList->at(stemno);
2252 temp = GetSuffixCounts(pStem, temp);
2253 for (affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
2255 output[stemno * GetNumberOfAffixes() + affixno] = temp[affixno];
2257 delete temp;
2258 temp = NULL;
2260 return output;
2264 //===================================================================================================//
2266 // Description length
2268 //===================================================================================================//
2269 double CSignature::GetSumOfDLofInternalPointers()
2272 double StemTotal = 0, SuffixTotal = 0;
2273 CStem* pStem;
2274 CSuffix* pSuffix;
2275 CSS ssSuffix;
2276 CSuffixCollection& Suffixes = *m_pMyMini->GetSuffixes();
2277 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2279 pStem = m_StemPtrList->at(stemno);
2280 StemTotal += pStem->GetLengthOfPointerToMe_2 ();
2283 for (int affixno = 1; affixno <= GetNumberOfAffixes(); affixno++)
2285 ssSuffix = GetPiece(affixno);
2286 pSuffix = Suffixes ^= ssSuffix;
2287 SuffixTotal += pSuffix->GetLengthOfPointerToMe();
2289 return StemTotal + SuffixTotal;
2291 // <<-------------------------------------------------------------------------------------------------------->>
2293 void CSignature::SetLengthOfPointerToMe(double L)
2295 m_LengthOfPointerToMe = L;
2296 return;
2299 // <<-------------------------------------------------------------------------------------------------------->>
2301 void CSignature::AppendSatelliteAffix(CParse& suffix)
2304 m_SatelliteAffixes.Append(suffix);
2307 //===================================================================================================//
2309 // Allomorphy
2311 //===================================================================================================//
2312 bool CSignature::Generalizes(CSignature* pSig)
2314 struct not_implemented { };
2315 throw not_implemented();
2317 // 1. Check they have the same length; find which one is longer.
2318 // 2. Go from longest to shortest pieces of the longer signature:
2319 // look for unambiguous correspondents in the other signature, and
2320 // put those pairs of corresponding affixes in some structure.
2321 // 3. After unambiguous cases, deal with ambiguous cases, if any exist.
2322 // 4. Find alignment
2324 // ed |NULL | NULL | ed |
2325 // ing|NULL | NULL | ing |
2326 // es |e | NULL | s |
2327 // e |e | NULL | NULL|
2330 // ed |e | <e> | ed |
2331 // ing|e | <e> | ing |
2332 // es |e | NULL | s |
2333 // e |e | NULL | NULL|
2336 // ien |ien | NULL | NULL |
2337 // ienne |ienn | NULL | e |
2338 // iens |ien | NULL | s |
2339 // iennes |ienn | NULL | es |
2341 // ien |ien | NULL | NULL |
2342 // ienne |ien | n | e |
2343 // iens |ien | NULL | s |
2344 // iennes |ien | n | es |
2346 CSignature* LongerSig, *ShorterSig;
2348 struct Row {
2349 QString LongAffix;
2350 QString Extension;
2351 QString Operation;
2352 QString ShortAffix;
2355 if (Size() != pSig->Size())
2356 return false;
2358 const int dif = GetKeyLength() - pSig->GetKeyLength();
2359 if (dif > 0) {
2360 LongerSig = this; ShorterSig = pSig;
2361 } else if (dif == 0) {
2362 return false;
2363 } else {
2364 LongerSig = pSig; ShorterSig = this;
2367 const int MAXAFFIXSIZE = 10;
2369 QStringList ShorterSigPieces;
2372 // Copy the affixes of ShorterSig,
2373 // from shortest to longest
2374 // onto the list ShorterSigPieces.
2375 if (ShorterSig->ContainsNULL())
2376 ShorterSigPieces.append(TheStringNULL);
2377 for (int m = 1; m < MAXAFFIXSIZE &&
2378 ShorterSigPieces.count() < ShorterSig->Size();
2379 ++m) {
2380 // XXX. this test makes no sense
2381 if (ShorterSig->ThisPieceLength(m) == m)
2382 ShorterSigPieces.prepend(
2383 ShorterSig->GetPiece(m).Display());
2385 Q_ASSERT(ShorterSigPieces.count() == ShorterSig->Size());
2388 QStringList LongerSigPieces;
2390 // Copy the affixes of LongerSig,
2391 // from shortest to longest
2392 // onto the list LongerSigPieces.
2393 if (LongerSig->ContainsNULL())
2394 LongerSigPieces.append(TheStringNULL);
2395 for (int m = 1; m < MAXAFFIXSIZE &&
2396 LongerSigPieces.count() < LongerSig->Size();
2397 ++m)
2398 if (LongerSig->ThisPieceLength(m) == m)
2399 LongerSigPieces.prepend(
2400 LongerSig->GetPiece(m).Display());
2401 Q_ASSERT(LongerSigPieces.count() == LongerSig->Size());
2404 CStringSurrogate ssIng, ssTing;
2405 foreach (QString shortersig_piece, ShorterSigPieces) {
2406 // example: "ing"
2407 CStringSurrogate short_affix(shortersig_piece);
2408 bool match = false;
2409 foreach (QString longersig_piece, LongerSigPieces) {
2410 // example "ting"
2411 CStringSurrogate long_affix(longersig_piece);
2412 if (long_affix.IsNULL())
2413 continue;
2414 if (short_affix != long_affix.Right(
2415 short_affix.GetLength()))
2416 continue;
2417 bool unambiguous_match = !match;
2418 if (!match)
2419 match = true;
2421 if (!unambiguous_match)
2422 continue;
2424 Row ThisRow;
2425 ThisRow.LongAffix =
2426 long_affix.Display();
2427 ThisRow.ShortAffix =
2428 short_affix.Display();
2429 ThisRow.Extension = long_affix.Left(
2430 long_affix.GetLength() -
2431 short_affix.GetLength())
2432 .Display();
2433 // XXX. use ThisRow...
2434 static_cast<void>(ThisRow);
2437 return false;
2439 // <<-------------------------------------------------------------------------------------------------------->>
2440 // <<-------------------------------------------------------------------------------------------------------->>
2441 void CSignature::CutMyWordsAsIDeclare()
2442 { CStem* stem;
2444 if ( is_initial (GetAffixLocation()) )
2446 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
2447 stem = GetStem(stemno);
2449 // For each prefix in signature:
2450 for (int prefixno = 1; prefixno <= Size(); ++prefixno) {
2451 CStringSurrogate prefix = GetPiece(prefixno);
2453 prefix.SetBackwards(false);
2454 if (prefix.IsNULL())
2455 // NULL + stem prefix needs no cut
2456 continue;
2458 // get correspond word
2459 CParse word_text = prefix + stem->GetKey();
2460 CStem* word = *GetLexicon()->GetWords() ^= word_text;
2461 Q_ASSERT(word != 0);
2463 if (word->Size() > 1 )
2464 // already analyzed
2465 continue;
2466 GetLexicon()->LogFile ("", "", word->GetKey().Display());
2468 // analyze word
2469 const int cut_point = word->GetKeyLength() - stem->GetKeyLength();
2470 word->CutRightBeforeHere(cut_point);
2471 word->SetStemLoc(2);
2472 word->SetPrefixLoc(1);
2473 //m_pLexicon->UpdateWord(word);
2477 else
2479 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
2480 stem = GetStem(stemno);
2482 // For each affix in signature:
2483 for (int suffixno = 1; suffixno <= Size(); ++suffixno) {
2484 CStringSurrogate suffix = GetPiece(suffixno);
2486 suffix.SetBackwards(false);
2487 if (suffix.IsNULL())
2488 // stem + NULL suffix needs no cut
2489 continue;
2491 // get correspond word
2492 CParse word_text = stem->GetKey() + suffix;
2493 CStem* word = *GetLexicon()->GetWords() ^= word_text;
2494 Q_ASSERT(word != 0);
2496 if (word->Size() > 1 )
2497 // already analyzed
2498 continue;
2499 GetLexicon()->LogFile ("", "", word->GetKey().Display());
2501 // analyze word
2502 const int cut_point = word->GetKeyLength() - stem->GetKeyLength();
2503 word->CutRightBeforeHere(cut_point);
2504 word->SetStemLoc(1);
2505 //m_pLexicon->UpdateWord(word);
2511 void CSignature::OutputSignatureXfst( QTextStream& outf, int count)
2513 QString strOutput;
2514 CParse StemList;
2515 QString string;
2517 outf << endl;
2519 outf << "# " << count << ": " << Display('.', m_pMyMini->GetOutFilter()) << endl;
2520 if (this->GetMentorList()->count() > 0)
2521 outf << "# MentorList() size: " << this->GetMentorList()->count() << endl;
2522 else
2523 outf << "# No MentorList() items" << endl;
2525 outf << "# robustness: " << m_Robustness << endl;
2527 if( GetMentor()!=NULL )
2529 outf << "# Has mentor: skipping" << endl;
2530 return;
2533 outf << "define STEM" << count << " "; // << " \\" << endl;
2535 //added
2536 QStringList stems;
2537 for (int i = 0; i < this->GetNumberOfStems(); i++)
2539 stems.append( this->GetStem(i)->Display() );
2542 // add stems from child sigs
2544 for (int z = 0; z < this->GetMentorList()->size(); z++)
2546 CSignature * qSig = this->GetMentorList()->at(z);
2548 QStringList qSufList;
2549 for (int i = 0; i < qSig->GetNumberOfAffixes(); i++)
2550 qSufList.append(qSig->GetSuffix(i)->Display());
2552 //generate new words here:
2553 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2555 outf << endl;
2556 CSuffix* pSuf = this->GetSuffix(i);
2557 QString sufStr = pSuf->Display( 0 );//, m_pMyMini->GetOutFilter() );
2558 if ( !qSufList.contains(sufStr) )
2560 outf<< "#### Suffix to be expanded: "<< sufStr << endl;
2561 for (int j = 0; j < qSig->GetNumberOfStems(); j++)
2563 QString stemStr = qSig->GetStem(j)->Display();
2564 if (sufStr.compare("NULL") == 0)
2565 outf << "### "<< stemStr << endl;
2566 else
2567 outf << "### "<< stemStr << " " << sufStr << endl;
2573 // add stems from child sigs
2574 for (int z = 0; z < this->GetMentorList()->size(); z++)
2576 CSignature * qSig = this->GetMentorList()->at(z);
2577 for (int i = 0; i < qSig->GetNumberOfStems(); i++)
2579 stems.append( qSig->GetStem(i)->Display( 0, m_pMyMini->GetOutFilter() ) );
2583 stems.sort();
2584 int m = 1;
2586 QStringList::Iterator strIt = stems.begin();
2587 outf << "[ {" << *strIt << "} ";
2588 ++strIt;
2591 for( ; strIt != stems.end(); ++strIt )
2593 if( m % 5 == 0 )
2595 outf << endl;
2596 outf << " ";
2598 outf << "| {" << *strIt << "} ";
2599 m++;
2602 outf << "]; "<<endl;
2603 outf << "define SUF" << count << " [ ";
2604 QStringList suffixes;
2605 bool first = 1;
2607 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2609 CSuffix* pSuffix = this->GetSuffix(i);
2610 if(first)
2611 first=0;
2612 else
2613 outf << "|";
2614 QString str = pSuffix->Display( 0 );
2615 if (str.compare("NULL") == 0)
2616 outf << " 0 ";
2617 else
2618 outf << " {" << str << "} ";
2621 outf << "];" << endl;
2623 outf << "define SIG" << count << " STEM" << count << " SUF"<< count << ";" << endl ;
2625 outf << "push SIG"<< count << endl;
2627 /* TEMP SOLN: now write cross product in comments */
2628 for ( QStringList::Iterator strIt = stems.begin() ; strIt != stems.end(); ++strIt )
2630 //QList<CSuffix*>::iterator suffix_it = m_SuffixPtrList->begin();
2631 //CSuffix* pSuffix;
2632 //while ( (pSuffix = *suffix_it) != 0 )
2634 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2636 CSuffix* pSuffix = this->GetSuffix(i);
2637 QString str = pSuffix->Display( 0 );//, m_pMyMini->GetOutFilter() );
2638 if (str.compare("NULL") == 0)
2639 outf << "## "<< *strIt << endl;
2640 else
2641 outf << "## "<< *strIt << str << endl;
2648 //--------------------------------------------------------------------------//
2649 void CSignature::RecalculateStemAndWordPointers()
2650 //--------------------------------------------------------------------------//
2653 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
2655 QString stem = GetStem(stemno)->Display();
2656 switch (m_AffixLocation)
2658 case WORD_FINAL:
2659 case STEM_FINAL:
2660 for (int suffixno = 0; suffixno < GetNumberOfAffixes(); suffixno++)
2662 QString suffix = GetSuffix(suffixno)->Display();
2663 if (suffix == "NULL") suffix = "";
2664 QString word = stem + suffix;
2665 CStem* pWord = *GetLexicon()->GetWords() ^= word;
2666 AppendWordPointer( pWord);
2668 break;
2669 case WORD_INITIAL:
2670 case STEM_INITIAL:
2671 for (int prefixno = 0; prefixno < GetNumberOfAffixes(); prefixno++)
2673 QString prefix = GetPrefix(prefixno)->Display();
2674 if (prefix == "NULL") prefix = "";
2675 QString word = prefix + stem;
2676 CStem* pWord = *GetLexicon()->GetWords() ^= word;
2677 AppendWordPointer(pWord);
2680 } // end of stemno loop
2682 //--------------------------------------------------------------------------//