1 // Implementation of CSignature, CSignatureListViewItem methods
2 // Copyright © 2009 The University of Chicago
7 #include "linguisticamainwindow.h"
9 #include "MiniLexicon.h"
10 #include "LPreferences.h"
11 #include "CorpusWord.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "SparseIntVector.h"
21 #include "CompareFunc.h"
25 #include "implicit_cast.h"
27 bool stemlessthan(const QPair
<CStem
*, int> pair1
, const QPair
<CStem
*, int> pair2
);
29 bool stemlessthan(const QPair
<CStem
*, int> pair1
, const QPair
<CStem
*, int> pair2
)
31 return pair2
.second
< pair1
.second
;
35 //===================================================================================================//
37 // Signature listview item
39 //===================================================================================================//
40 CSignatureListViewItem::CSignatureListViewItem(Q3ListView
*parent
,
41 QString signature
, int mini
, CSignature
* pSig
,
42 QMap
<QString
, QString
>* filter
)
43 : Q3ListViewItem( parent
, signature
)
48 m_parentlist
= parent
;
53 CSignatureListViewItem::CSignatureListViewItem(Q3ListViewItem
*parent
,
54 QString signature
, int mini
, CSignature
* pSig
,
55 QMap
<QString
, QString
>* filter
)
56 : Q3ListViewItem( parent
, signature
)
61 m_parentlist
= parent
->listView();
65 int CSignatureListViewItem::compare(Q3ListViewItem
*item
, int col
, bool asc
) const
70 return MakeComparable ( m_signature
->ComputeDLofModel() , ((CSignatureListViewItem
*) item
)->GetSignature()->ComputeDLofModel() );
74 return MakeComparable ( m_signature
->GetCorpusCount() , ((CSignatureListViewItem
*) item
)->GetSignature()->GetCorpusCount() );
78 return MakeComparable ( m_signature
->GetNumberOfStems() , ((CSignatureListViewItem
*) item
)->GetSignature()->GetNumberOfStems() );
82 return MakeComparable ( ((CSignatureListViewItem
*) item
)->GetSignature()->GetRobustness(), m_signature
->GetRobustness() );
86 return Q3ListViewItem::compare(item
, col
, asc
);
91 QString
CSignatureListViewItem::text( int column
) const
95 CSignatureListViewItem
* child
= NULL
;
104 if( m_signature
&& m_parentlist
->sortColumn() == 0 && m_signature
->GetMentor() )
106 return " : " + m_label
;
110 if( m_signature
&& m_signature
->GetNumberOfStems() > 0 )
112 if (m_signature
->GetNumberOfStems() > 0 ) return m_signature
->GetStem(0)->Display( QChar(0), m_filter
);
116 if( m_signature
) return dummy
.setNum( m_signature
->ComputeDLofModel() );
119 if( m_signature
) return dummy
.setNum ( m_signature
->GetCorpusCount() );
123 child
= (CSignatureListViewItem
*) firstChild();
126 if( child
->GetSignature() )
128 count
+= child
->GetSignature()->GetCorpusCount();
130 child
= (CSignatureListViewItem
*) child
->nextSibling();
132 return dummy
.setNum( count
);
135 if( m_signature
&& m_signature
->GetNumberOfStems() > 0 ) return dummy
.setNum( m_signature
->GetNumberOfStems() );
139 child
= (CSignatureListViewItem
*) firstChild();
142 if( child
->GetSignature() &&
143 child
->GetSignature()->GetNumberOfStems() > 0 )
145 count
+= child
->GetSignature()->GetNumberOfStems();
147 child
= (CSignatureListViewItem
*) child
->nextSibling();
149 return dummy
.setNum( count
);
152 if( m_signature
) return m_signature
->GetRemark();
156 if( m_signature
) return dummy
.setNum( (int) m_signature
->GetRobustness() );
161 child
= (CSignatureListViewItem
*) firstChild();
164 if( child
->GetSignature() &&
165 child
->GetSignature()->GetNumberOfStems() > 0 )
167 count
+= child
->GetSignature()->GetNumberOfStems();
169 child
= (CSignatureListViewItem
*) child
->nextSibling();
171 return dummy
.setNum( count
);
176 return Q3ListViewItem::text( column
);
180 //===================================================================================================//
184 //===================================================================================================//
185 void CSignature::BorrowedSigsDisplay(Q3ListView
* List
,
186 QMap
<QString
, QString
>* filter
)
188 QString source
= "Unknown", dummy
;
189 for (int minino
= 0; minino
< m_pMyMini
->GetMiniSize(); ++minino
) {
190 CMiniLexicon
* mini
= m_pMyMini
->GetMiniLexicon(minino
);
194 CSignatureCollection
& sigs
= *mini
->GetSignatures();
197 source
= dummy
.setNum(minino
+ 1);
202 static_cast<void>(new Q3ListViewItem(
203 List
, Display('.', filter
), source
));
206 //===================================================================================================//
208 // Constructor/destructor
210 //===================================================================================================//
212 CSignature::CSignature( CMiniLexicon
* Lexicon
) : CLParse( Lexicon
)
216 m_StemPtrList
= new QList
<CStem
*>();
217 m_WordPtrList
= new QList
<CStem
*>();
218 m_MentorList
= new QList
<CSignature
*>();
219 m_SuffixPtrList
= new QList
<CSuffix
*>();
220 m_PrefixPtrList
= new QList
<CPrefix
*>();
221 m_SortStyle
= eAlphabetized
;
222 // Description Length
224 m_DLofMyStemPointers
= 0;
225 m_DLofMyAffixPointers
= 0;
226 m_LengthOfPointerToMe
= 0;
227 m_MyGeneralizer
= NULL
;
231 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
235 CSignature::CSignature( eAffixLocation AffixLocation
, CMiniLexicon
* Lexicon
) : CLParse( Lexicon
)
238 m_StemPtrList
= new QList
<CStem
*>();
239 m_WordPtrList
= new QList
<CStem
*>();
240 m_MentorList
= new QList
<CSignature
*>();
241 m_SuffixPtrList
= new QList
<CSuffix
*>();
242 m_PrefixPtrList
= new QList
<CPrefix
*>();
243 m_SortStyle
= eAlphabetized
;
244 m_MyGeneralizer
= NULL
;
245 m_AffixLocation
= AffixLocation
;
248 // Description Length
250 m_DLofMyStemPointers
= 0;
251 m_DLofMyAffixPointers
= 0;
252 m_LengthOfPointerToMe
= 0;
253 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
257 CSignature::CSignature (const CParse
& ParseSig
, CMiniLexicon
* Lexicon
) : CLParse ( ParseSig
, Lexicon
)
260 m_AffixLocation
= Lexicon
->GetAffixLocation();
261 m_StemPtrList
= new QList
<CStem
*>();
262 m_WordPtrList
= new QList
<CStem
*>();
263 m_MentorList
= new QList
<CSignature
*>();
264 m_SuffixPtrList
= new QList
<CSuffix
*>();
265 m_PrefixPtrList
= new QList
<CPrefix
*>();
266 m_SortStyle
= eAlphabetized
;
268 m_MyGeneralizer
= NULL
;
269 // Description Length
271 m_DLofMyStemPointers
= 0;
272 m_DLofMyAffixPointers
= 0;
273 m_LengthOfPointerToMe
= 0;
274 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
278 CSignature::CSignature (const CParse
* pParseSig
, CMiniLexicon
* Lexicon
) : CLParse ( *pParseSig
, Lexicon
)
282 m_StemPtrList
= new QList
<CStem
*>();
283 m_WordPtrList
= new QList
<CStem
*>();
284 m_MentorList
= new QList
<CSignature
*>();
285 m_SuffixPtrList
= new QList
<CSuffix
*>();
286 m_PrefixPtrList
= new QList
<CPrefix
*>();
287 m_SortStyle
= eAlphabetized
;
288 m_MyGeneralizer
= NULL
;
289 m_AffixLocation
= Lexicon
->GetAffixLocation();
291 // Description Length
293 m_DLofMyStemPointers
= 0;
294 m_DLofMyAffixPointers
= 0;
295 m_LengthOfPointerToMe
= 0;
296 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
300 CSignature::CSignature(const CSignature
& Sig
) : CLParse (Sig
, Sig
.GetLexicon())
304 m_AffixLocation
= Sig
.GetAffixLocation();
305 m_Remark
= Sig
.GetRemark();
306 m_pMyMini
= Sig
.GetLexicon();
307 m_MyGeneralizer
= Sig
.GetGeneralizer();
309 int NumberOfStems
= Sig
.GetNumberOfStems();
310 int NumberOfAffixes
= Sig
.Size();
311 int NumberOfWords
= NumberOfStems
*NumberOfAffixes
;
312 QVector
<double> m_WordCounts (NumberOfAffixes
* NumberOfStems
);
313 QVector
<double> m_StemCounts ( NumberOfStems
);
314 QVector
<double> m_AffixCounts( NumberOfAffixes
);
315 QVector
<double> m_WordFrequencies (NumberOfWords
);
316 QVector
<double> m_StemFrequencies (NumberOfStems
);
317 QVector
<double> m_AffixFrequencies (NumberOfAffixes
);
318 m_TotalCount
= Sig
.GetTotalCount();
320 m_StemPtrList
= new QList
<CStem
*>();
321 for ( stemno
= 0; stemno
< NumberOfStems
; stemno
++)
323 AppendStemPtr( Sig
.GetStem(stemno
));
324 m_StemCounts
[stemno
] = Sig
.GetStemCount(stemno
);
325 m_StemFrequencies
[stemno
] = Sig
.GetStemFrequency(stemno
);
327 if (m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
) {
328 m_SuffixPtrList
= new QList
<CSuffix
*>();
329 for ( affixno
= 0; affixno
< NumberOfAffixes
; affixno
++)
331 AppendSuffixPtr ( Sig
.GetSuffix(affixno
) );
332 m_AffixCounts
[affixno
] = Sig
.GetAffixCount(affixno
);
333 m_AffixFrequencies
[affixno
] = Sig
.GetAffixFrequency(affixno
);
336 if (m_AffixLocation
== WORD_INITIAL
|| m_AffixLocation
== STEM_INITIAL
) {
337 m_PrefixPtrList
= new QList
<CPrefix
*>();
338 for ( affixno
= 0; affixno
< NumberOfAffixes
; affixno
++)
340 AppendPrefixPtr ( Sig
.GetPrefix(affixno
) );
341 m_AffixCounts
[affixno
] = Sig
.GetAffixCount(affixno
);
342 m_AffixFrequencies
[affixno
] = Sig
.GetAffixFrequency(affixno
);
347 m_WordPtrList
= new QList
<CStem
*>();
348 for (stemno
= 0; stemno
< NumberOfStems
; stemno
++) {
349 for (affixno
= 0; affixno
< NumberOfAffixes
; affixno
++) {
350 SetWordCount(stemno
, affixno
, 0);
351 AppendWordPointer (Sig
.GetWord(stemno
, affixno
));
355 m_Robustness
= Sig
.GetRobustness();
357 m_SortStyle
= eAlphabetized
;
358 m_MentorList
= new QList
<CSignature
*>();
367 CSignature::CSignature(const CStringSurrogate
& ssSig
, CMiniLexicon
* Lexicon
) : CLParse(ssSig
, Lexicon
)
369 Collapse( ssSig
, '.');
372 m_StemPtrList
= new QList
<CStem
*>();
373 m_WordPtrList
= new QList
<CStem
*>();
374 m_MentorList
= new QList
<CSignature
*>();
375 m_SuffixPtrList
= new QList
<CSuffix
*>();
376 m_PrefixPtrList
= new QList
<CPrefix
*>();
377 m_SortStyle
= eAlphabetized
;
378 m_MyGeneralizer
= NULL
;
379 // Description Length
381 m_DLofMyStemPointers
= 0;
382 m_DLofMyAffixPointers
= 0;
383 m_LengthOfPointerToMe
= 0;
385 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
389 m_SortStyle
= eAlphabetized
;
393 CSignature::~CSignature()
396 if( m_StemPtrList
) delete m_StemPtrList
;
397 if( m_WordPtrList
) delete m_WordPtrList
;
398 if( m_MentorList
) delete m_MentorList
;
399 if( m_SuffixPtrList
) delete m_SuffixPtrList
;
400 if( m_PrefixPtrList
) delete m_PrefixPtrList
;
402 //===================================================================================================//
406 //===================================================================================================//
407 QString
CSignature::Display(QChar sep
, QMap
<QString
, QString
>* filter
) const
411 sd
= m_pMyMini
->GetDocument()->GetPreferences()
412 ->GetPreference("Sig_Delimiter");
416 return CParse::Display(sd
.at(0), filter
);
419 QString
CSignature::Display(QMap
<QString
, QString
>* filter
) const
420 { return CParse::Display(filter
); }
422 QString
CSignature::Display() const
423 { return CParse::Display('.'); }
425 //===================================================================================================//
429 //===================================================================================================//
431 void CSignature::ConsumeParse( CParse
* pParse
)
435 CopyParseStructure( *pParse
);
439 void CSignature::Suicide()
441 //TODO: fill this in;
443 void CSignature::SetMyGeneralizer (CSignature
* pSig
)
445 m_MyGeneralizer
= pSig
;
447 //===================================================================================================//
451 //===================================================================================================//
452 void CSignature::operator=(const CSignature
* pSig
)
454 m_pMyMini
= pSig
->GetMyMini();
455 CLParse::operator=(*pSig
);
456 m_AffixLocation
= pSig
->GetAffixLocation();
458 int NumberOfStems
= pSig
->GetNumberOfStems();
459 int NumberOfAffixes
= pSig
->GetNumberOfAffixes();
460 int NumberOfWords
= NumberOfStems
*NumberOfAffixes
;
461 m_StemCounts
.resize(NumberOfStems
);
462 m_WordCounts
.resize(NumberOfWords
);
463 m_AffixCounts
.resize(NumberOfAffixes
);
465 m_StemCounts
.resize(NumberOfStems
);
466 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++) {
467 m_StemPtrList
->append ( pSig
->GetStem(stemno
) );
468 m_StemCounts
[stemno
]=pSig
->GetStemCount(stemno
);
469 for (int affixno
= 0; affixno
< pSig
->GetNumberOfAffixes(); affixno
++)
471 m_WordPtrList
->append ( pSig
->GetWord(stemno
, affixno
));
472 SetWordCount(stemno
, affixno
, pSig
->GetWordCount(stemno
, affixno
));
476 if (m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
) {
477 for (int suffixno
= 0; suffixno
< pSig
->GetNumberOfAffixes(); suffixno
++)
479 m_SuffixPtrList
->append ( pSig
->GetSuffix(suffixno
) );
480 m_AffixCounts
[suffixno
] = pSig
->GetAffixCount(suffixno
);
483 for (int prefixno
= 0; prefixno
< GetNumberOfAffixes(); prefixno
++) {
484 m_PrefixPtrList
->append(pSig
->GetPrefix(prefixno
) );
485 m_AffixCounts
[prefixno
] = pSig
->GetAffixCount(prefixno
);
491 m_Robustness
= pSig
->GetRobustness();
493 m_Remark
= pSig
->GetRemark();
497 QTextStream
& operator<< (QTextStream
& stream
, CSignature
* pSig
)
501 stream
<< endl
<< pSig
->Display();
503 stream
<< pSig
-> GetNumberOfStems() << " " << pSig
->GetCorpusCount();
505 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++)
507 pStem
= pSig
->GetStem(stemno
);
508 if ( pStem
->GetKey() != CStringSurrogate() )
512 stream
<< pStem
->GetKey().Display();
524 // <<-------------------------------------------------------------------------------------------------------->>
525 void CSignature::operator<< (CStem
* pStem
) //add to tail of list.
530 if ( m_StemPtrList
->indexOf ( pStem
) < 0 )
532 Q_ASSERT (pStem
->GetKeyLength() > 0);
533 m_StemPtrList
->append(pStem
);
536 Q_ASSERT ( m_PieceCount
<= m_LengthOfPieceVector
) ;
538 for (int wordno
= 0; wordno
< pStem
->GetWordPtrList()->size(); wordno
++)
540 pWord
= pStem
->GetWord(wordno
);
541 Q_ASSERT (pWord
->GetKeyLength() > 0);
542 m_WordPtrList
->append (pWord
);
544 pStem
->SetSuffixSignature (this);
547 m_Robustness
= GetRobustness();
550 //===================================================================================================//
552 // Accessors and setters
554 //===================================================================================================//
555 CSignature
* CSignature::GetMentor ( ) { return m_Mentor
; }
556 // <<-------------------------------------------------------------------------------------------------------->>
557 void CSignature::SetMentor ( CSignature
* pSig
)
560 if( pSig
&& pSig
->GetMentorList() && pSig
->GetMentorList()->indexOf (this) < 0) {
561 pSig
->GetMentorList()->append( this );
566 int CSignature::GetNumberOfAffixes() const
569 if ( m_AffixLocation
== STEM_FINAL
|| m_AffixLocation
== WORD_FINAL
)
571 return m_SuffixPtrList
->count();
573 if ( m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
575 return m_PrefixPtrList
->count();
581 void CSignature::AppendSuffixPtr (CSuffix
* pSuffix
) { m_SuffixPtrList
->append(pSuffix
);}
582 QList
<CSignature
*>* CSignature::GetMentorList( ) { return m_MentorList
; }
583 int CSignature::GetNumberOfStems() const { return m_StemPtrList
->count(); }
584 //int CSignature::GetNumberOfSuffixes () const { return m_SuffixPtrList->count(); }
585 void CSignature::SetRemark ( QString remark
) { m_Remark
= remark
; }
586 CPrefix
* CSignature::GetPrefix(int prefixno
) const { return m_PrefixPtrList
->at(prefixno
); }
587 QList
<CPrefix
*>* CSignature::GetPrefixPtrList() const { return m_PrefixPtrList
; }
588 QString
CSignature::GetRemark() const { return m_Remark
; }
589 QList
<CStem
*>* CSignature::GetStemPtrList() const { return m_StemPtrList
;}
590 CStem
* CSignature::GetStem(int stemno
) const { return m_StemPtrList
->at(stemno
); }
591 CSuffix
* CSignature::GetSuffix(int suffixno
) const { return m_SuffixPtrList
->at(suffixno
); }
592 QList
<CSuffix
*>* CSignature::GetSuffixPtrList() const { return m_SuffixPtrList
; }
593 int CSignature::GetTotalCount() const { return m_TotalCount
; }
594 double CSignature::GetCorpusCount() const { return corpus_count::GetCorpusCount();}
595 float CSignature::GetSortingQuantity() const { return (float) GetRobustness();}
597 bool CSignature::StemListContains(CStem
* pstem
) { return m_StemPtrList
->contains(pstem
); }
598 void CSignature::AppendStemPtr(CStem
* pStem
) const { m_StemPtrList
->append(pStem
);}
601 eAffixLocation
CSignature::GetAffixLocation() const { return m_AffixLocation
; }
602 // <<-------------------------------------------------------------------------------------------------------->>
603 CStem
* CSignature::GetWord(int stemno
, int affixno
) const
605 if (stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes())
607 if (stemno
* GetNumberOfAffixes() + affixno
>= m_WordPtrList
->size() )
609 return m_WordPtrList
->at(stemno
* GetNumberOfAffixes() + affixno
);
611 CParse
CSignature::GetStems()
615 if ( m_StemPtrList
->count() == 0 ) { return List
; } // ********** This is clearly a mistake. Fix it.
617 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
619 List
.Append( GetStem(stemno
)->GetKey() );
623 // <<-------------------------------------------------------------------------------------------------------->>
632 // <<-------------------------------------------------------------------------------------------------------->>
633 double CSignature::GetStemFrequency(int stemno
) const {
634 if (stemno
< 0 || stemno
> GetNumberOfStems() ) return 0;
635 return m_StemFrequencies
[stemno
];
638 // <<-------------------------------------------------------------------------------------------------------->>
639 double CSignature::GetAffixFrequency(int affixno
) const {
640 if (affixno
< 0 || affixno
> GetNumberOfAffixes() ) {return 0; }
641 return m_AffixFrequencies
[affixno
];
643 // <<-------------------------------------------------------------------------------------------------------->>
644 double CSignature::GetStemCount(int stemno
) const {
645 if (stemno
< 0 || stemno
> GetNumberOfStems() ){ return 0; }
646 return m_StemCounts
[stemno
];
648 // <<-------------------------------------------------------------------------------------------------------->>
650 double CSignature::GetAffixCount(int affixno
) const
651 { if (affixno
< 0 || affixno
> GetNumberOfAffixes() ) return 0;
652 return m_AffixCounts
[affixno
];
654 // <<-------------------------------------------------------------------------------------------------------->>
655 double CSignature::GetWordCount(int wordno
)const {
656 if (wordno
< 0 || wordno
> GetNumberOfWords() ) { return 0;}
657 return m_WordCounts
[wordno
]; }
658 // <<-------------------------------------------------------------------------------------------------------->>
660 //===================================================================================================//
662 // Calculate frequencies and counts
664 //===================================================================================================//
665 void CSignature::CalculateFrequencies(CMiniLexicon
* Lexicon
)
667 CStringSurrogate Suffix
;
670 CCorpusWord
* pCorpusWord
;
671 Q_ASSERT( GetCorpusCount() > 0);
672 int TotalCorpusCount
= 0;
673 int* SuffixCount
= new int [ Size()+ 1 ];
674 for (int suffixno
= 1; suffixno
<= Size(); ++suffixno
)
675 { SuffixCount
[suffixno
] = 0; }
677 for (int suffixno
= 1; suffixno
<= Size(); suffixno
++)
679 Suffix
= GetPiece(suffixno
);
680 pSuffix
= new CSuffix(Suffix
);
682 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
684 pStem
= GetStem(stemno
);
685 pCorpusWord
= Lexicon
->FindAWord (pStem
, pSuffix
);
686 if( pCorpusWord
) // might not exist if we have collapsed signatures.
688 TotalCorpusCount
+= pCorpusWord
->GetCorpusCount();
689 SuffixCount
[suffixno
] += pCorpusWord
->GetCorpusCount();
694 delete [] SuffixCount
;
697 // <<-------------------------------------------------------------------------------------------------------->>
698 void CSignature::ListDisplay(Q3ListView
* List
,
699 QMap
<QString
, QString
>* filter
, bool ExpressDeletees
)
701 CSignature
sig(m_pMyMini
);
702 Express(sig
, ExpressDeletees
);
703 QString text
= sig
.Display('.', filter
);
705 static_cast<void>(new CSignatureListViewItem(
706 List
, text
, m_pMyMini
->GetIndex(), this, filter
));
709 // <<-------------------------------------------------------------------------------------------------------->>
710 void CSignature::FindCorpusCount( )
712 SetCorpusCount ( 0 );
713 for (int stemno
=0; stemno
< GetNumberOfStems(); stemno
++) {
714 for (int affixno
= 0; affixno
< GetNumberOfAffixes(); affixno
++)
715 IncrementCorpusCount ( GetWord(stemno
, affixno
)->GetCorpusCount() );
718 // <<-------------------------------------------------------------------------------------------------------->>
719 void CSignature::AttachToSuffixSig(CStem
* pStem
, bool bLookAtPreviousSig
) //add to tail of list.
722 int numberofaffixes
= GetNumberOfAffixes();
724 CSignature
* pOldSig
= pStem
->GetSuffixSignature();
725 QString stem
= pStem
->Display();
727 /* First, remove pStem from any other SuffixSignature it might be linked to.*/
728 if ( pOldSig
&& pOldSig
!= this ) {
729 pOldSig
->DetachStem( pStem
, eDo_Not_Call_Words
);
730 pOldSig
->RecalculateStemAndWordPointers();
733 stemno
= m_StemPtrList
->indexOf ( pStem
);
735 m_StemPtrList
->append( pStem
);
736 stemno
= GetNumberOfStems()-1;
739 switch( m_AffixLocation
){
742 for (int affixno
= 0; affixno
< numberofaffixes
; affixno
++)
744 pWord
= GetLexicon()->GetWordFromStemSuffix(pStem
, GetSuffix(affixno
));
747 AppendWordPointer( pWord
);
748 pWord
->SetSuffixSignature (this);
752 AppendWordPointer(NULL
);
758 for (int prefixno
= 0; prefixno
< numberofaffixes
; prefixno
++)
760 pWord
= GetLexicon()->GetWordFromStemPrefix(pStem
, GetPrefix(prefixno
));
763 AppendWordPointer( pWord
);
764 pWord
->SetPrefixSignature (this);
768 AppendWordPointer(NULL
);
774 pStem
->SetSuffixSignature( this );
775 IncrementCorpusCount( pStem
->GetCorpusCount()-1 );// first time CC is incremented
778 m_Robustness
= GetRobustness();
780 // <<-------------------------------------------------------------------------------------------------------->>
781 void CSignature::AttachToPrefixSig( CStem
* pStem
, bool bLookAtPreviousSig
) //add to tail of list.
784 CSignature
* pOldSig
= pStem
->GetPrefixSignature();
786 /* First, remove pStem from any other PrefixSignature it might be linked to.*/
787 if ( pOldSig
&& pOldSig
!= this ) {
788 pOldSig
->DetachStem( pStem
, eDo_Not_Call_Words
);
789 RecalculateStemAndWordPointers();
792 if( m_StemPtrList
->indexOf ( pStem
) < 0 ) {
793 AppendStemPtr( pStem
);
796 // move the Words from the old signature to this, the new one.
798 for (int wordno
= 0; wordno
< pStem
->GetNumberOfWords(); wordno
++) {
799 pWord
= pStem
->GetWord(wordno
);
800 m_WordPtrList
->append (pWord
);
801 pWord
->SetPrefixSignature (this);
806 pStem
->SetPrefixSignature( this );
807 IncrementCorpusCount( pStem
->GetCorpusCount()-1 );
808 m_Robustness
= GetRobustness();
811 // <<-------------------------------------------------------------------------------------------------------->>
812 double CSignature::GetRobustness() const
814 int SuffixLetters
= 0,
817 if (m_Robustness
== 0)
819 SuffixLetters
= GetKeyLength();
820 QString Null
= "NULL";
821 if ( Contains( CStringSurrogate(Null
.unicode(),0,Null
.length()) ) ) { SuffixLetters
-= 4; }
824 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
825 pStem
= GetStem(stemno
);
826 StemLetters
+= pStem
->GetKeyLength();
829 m_Robustness
= ( Size() - 1 ) * StemLetters
+ (GetNumberOfStems() - 1) * SuffixLetters
;
834 // <<-------------------------------------------------------------------------------------------------------->>
835 void CSignature::SetRobustness ( double R
) { m_Robustness
= R
; }
836 // <<-------------------------------------------------------------------------------------------------------->>
838 // the counts of each individual word analyzed by this signature.
839 //double* CSignature::GetWordCounts() const { return m_WordCounts;
841 // <<-------------------------------------------------------------------------------------------------------->>
842 double CSignature::GetWordCount(int stemno
, int affixno
) const
844 if ( stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes() ) return 0;
845 return m_WordCounts
[stemno
* GetNumberOfStems() + affixno
];
847 // <<-------------------------------------------------------------------------------------------------------->>
848 void CSignature::SetWordCount (int stemno
, int affixno
, double value
)
850 if ( stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes() )
852 m_WordCounts
[stemno
* GetNumberOfAffixes() + affixno
] = value
;
857 // <<-------------------------------------------------------------------------------------------------------->>
859 void CSignature::CalculateWordCounts()
862 int numberofstems
= GetNumberOfStems();
863 int numberofaffixes
= GetNumberOfAffixes();
868 m_WordCounts
.clear();
869 m_WordCounts
.resize(numberofstems
*numberofaffixes
);
870 m_StemCounts
.clear();
871 m_StemCounts
.resize(numberofstems
);
872 m_AffixCounts
.clear();
873 m_AffixCounts
.resize(numberofaffixes
);
875 for (int affixno
= 0; affixno
< numberofaffixes
; affixno
++) { m_AffixCounts
[affixno
] = 0; }
876 for (int stemno
= 0; stemno
< numberofstems
; stemno
++) { m_StemCounts
[stemno
] = 0; }
880 for (int stemno
= 0; stemno
< numberofstems
; stemno
++)
882 for ( int affixno
= 0; affixno
< numberofaffixes
; affixno
++)
884 pWord
= GetWord(stemno
, affixno
);
885 count
= pWord
->GetCorpusCount();
886 // SetWordCount (stemno, affixno, count);
887 // m_StemCounts[stemno] = m_StemCounts[stemno] + count;
888 // m_AffixCounts[affixno] = m_AffixCounts[affixno] + count;
889 // m_TotalCount += count;
893 if (m_TotalCount <= 0) return;
895 m_WordFrequencies.resize(numberofstems*numberofaffixes);
896 m_StemFrequencies.resize(numberofstems);
897 m_AffixFrequencies.resize(numberofaffixes);
899 for ( int stemno = 0; stemno < numberofstems; stemno++)
901 m_StemFrequencies[stemno] = m_StemCounts[stemno]/m_TotalCount;
902 for ( affixno = 0; affixno < numberofaffixes; affixno++)
904 wordno = stemno * numberofaffixes + affixno;
905 m_WordFrequencies[wordno] = GetWordCount(stemno, affixno) / m_TotalCount;
909 for (int affixno = 0; affixno < numberofaffixes; affixno++){
910 m_AffixFrequencies[affixno] = m_AffixCounts[affixno] / m_TotalCount;
916 //=================================================================================================/
918 // TODO: make sure COST function is consistent with older versions and working right
919 double CSignature::FindCost(CMiniLexicon
* Lexicon
)
921 //=================================================================================================/
927 Sum over all of its stems :
929 log ( CorpusSize / Stem-count ) ( cost )
930 length ( stem ) * cost of a letter ( savings )
932 Sum over all of its suffixes:
934 log ( CorpusSize / suffix-count ) ( cost )
935 length ( suffix ) * cost of a letter ( savings )
945 CostOfALetter
= base2log (26),
947 NumberOfWords
= Lexicon
->GetWords()->GetCount();
951 for (int affixno
= 1; affixno
<= Size(); affixno
++)
953 if( m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
)
955 pAffix
= *Lexicon
->GetSuffixes() ^= GetPiece(affixno
);
959 pAffix
= *Lexicon
->GetPrefixes() ^= GetPiece(affixno
);
962 if ( pAffix
) // it already exists
964 ThisAffixCost
= base2log ( NumberOfWords
/ pAffix
->GetUseCount() );
968 ThisAffixCost
= base2log ( NumberOfWords
/GetNumberOfStems() );
969 ThisAffixCost
+= GetPiece(affixno
).GetLength() * CostOfALetter
;
971 AffixCost
+= ThisAffixCost
;
973 AffixSavings
+= GetPiece(affixno
).GetLength() * CostOfALetter
;
975 SignatureCost
+= ThisAffixCost
;
979 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
981 pStem
= m_StemPtrList
->at(stemno
);
982 StemCost
+= base2log ( NumberOfWords
/ Size() ); // Size is the number of words that use stem, of course.
983 StemCost
+= pStem
->GetKeyLength() * CostOfALetter
;
984 StemSavings
+= pStem
->GetKeyLength() * CostOfALetter
* Size(); // save for each time stem appears, with each suffix
985 SignatureCost
+= StemCost
;
988 Cost
= AffixCost
+ StemCost
- AffixSavings
- StemSavings
+ SignatureCost
;
994 // <<-------------------------------------------------------------------------------------------------------->>
996 void CSignature::OutputSignature( QTextStream
& outf
)
1004 outf
<< " ------------------------------------------------------------------------------------------ " << endl
;
1005 outf
<< Display( '.', m_pMyMini
->GetOutFilter() );
1006 outf
<< endl
<< " ------------------------------------------------------------------------------------------ " << endl
;
1013 outf
<< "Number of stems: ";
1014 outf
<< QString("%1").arg( (int) GetNumberOfStems() );
1016 outf
<< " Corpus count: ";
1017 outf
<< QString("%1").arg( GetCorpusCount() );
1021 outf
<< GetRemark().replace( QChar(' '), "_" );
1024 outf
<< "Number of affixes: ";
1025 outf
<< GetNumberOfAffixes();
1026 outf
<< " Word Pointer List length: ";
1027 outf
<< m_WordPtrList
->count();
1033 CalculateWordCounts();
1037 outf
.setFieldAlignment( QTextStream::AlignLeft
);
1038 QList
< QPair
<CStem
*, int> > pstems
;
1039 for (int stemno
=0; stemno
< GetNumberOfStems(); stemno
++ )
1041 pStem
= GetStem(stemno
);
1042 pstems
.append( qMakePair(pStem
, pStem
->GetCorpusCount() ) );
1043 if (pStem
->GetKeyLength() > maxlength
) { maxlength
= pStem
->GetKeyLength();}
1045 qSort(pstems
.begin(), pstems
.end(), stemlessthan
);
1047 outf
<< "Sorted by stem frequency: " << endl
<< endl
;
1048 outf
<< "# Rank | Stem | Words .... " << endl
;
1049 outf
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1052 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
1056 pStem
= pstems
[stemno
].first
;
1059 outf
. width( maxlength
+ 5);
1060 outf
<< pStem
->Display();
1062 outf
<< pstems
[stemno
].second
;
1065 outf
<< endl
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1066 outf
<< endl
<< endl
<<"Display all words with counts: " << endl
;
1067 outf
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1069 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
1071 for ( int affixno
= 0; affixno
< GetNumberOfAffixes(); affixno
++)
1073 pWord
= GetWord(stemno
, affixno
);
1076 outf
.setFieldWidth (maxlength
+ 5); outf
<< pWord
->Display();
1077 outf
.setFieldWidth (5) ; outf
<< string
.setNum( pWord
->GetCorpusCount() );
1082 outf
<< endl
<< endl
;
1088 /* The purpose of this function is to take a signature of the form A.SUFFIX
1089 and make it NULL.SUFFIX (the pAlternateSig), and move that letter A back onto its stems.
1092 // <<-------------------------------------------------------------------------------------------------------->>
1093 void CSignature::RemoveLetter (CStringSurrogate
& ssLetter
, CMiniLexicon
* Lexicon
, CSignature
* pAlternateSig
)
1102 QString OldKey
= Display();
1103 CStringSurrogate ssSuffix
,
1106 CSignature
NewSig ( WORD_FINAL
, Lexicon
);
1107 int LetterLength
= ssLetter
.GetLength();
1109 CSignature
*qSig
= NULL
,
1115 QMap
<QString
,CSuffix
*> SuffixPtrTranslation
;
1117 /* Create the NewSig */
1118 for (int affixno
= 1; affixno
<= Size(); affixno
++)
1120 ssSuffix
= GetPiece(affixno
);
1121 if(!NewSig
.GetSortStyle()== eAlphabetized
) NewSig
.Alphabetize();
1122 if ( ssSuffix
== ssLetter
)
1124 if(!NewSig
.GetSortStyle()==eAlphabetized
) NewSig
.Alphabetize();
1125 NewSig
.Append ( CStringSurrogate(Null
.unicode(),0,Null
.length()) );
1129 QString lt_brak
= "<", rt_brak
= ">";
1131 PSuffix
= CStringSurrogate(lt_brak
.unicode(),0,1);
1132 PSuffix
+= ssLetter
;
1133 PSuffix
+= CStringSurrogate(rt_brak
.unicode(),0,1);
1134 PSuffix
+= ssSuffix
;
1136 pSuffix
= *Lexicon
->GetSuffixes() << PSuffix
;
1138 Suffix
= "<" + ssLetter
.Display() + ">" + ssSuffix
.Display();
1139 SuffixPtrTranslation
[ ssSuffix
.Display() ] = pSuffix
; // based on old suffix
1140 // SuffixStringTranslation[ ssSuffix.Display() ] = Suffix;
1142 NewSig
.Append ( PSuffix
.GetKey() );
1146 /* Change the KEY of this signature */
1149 QString remark
= GetRemark() + " +allomorphy";
1150 SetRemark ( remark
);
1152 //-----------------------------------------------------------//
1153 // Change the signature, the stems, the words -- and the suffixes.
1154 //-----------------------------------//
1155 /* Deal with the stems */
1156 //-----------------------------------//
1159 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
1161 CStem
* pStem
= m_StemPtrList
->at(stemno
);
1162 ssStem
= pStem
->GetKey();
1163 PNewStem
= ssStem
+ ssLetter
;
1164 qStem
= *Lexicon
->GetStems() ^= PNewStem
;
1166 if (qStem
) // -- if the larger one already existed
1168 pOlderSig
= *Lexicon
->GetSignatures() ^= qStem
->GetSuffixList();
1170 // this removes both stem and word from signature:
1171 pOlderSig
-> DetachStem ( qStem
, eCall_Words
); // we might want to eliminate this sig if it has no more stems
1173 qStem
-> GetSuffixList()->MergeAndAlphabetizeParse( CParse(NewSig
) );
1175 qSig
= *Lexicon
->GetSignatures() << qStem
->GetSuffixList();
1177 // attaches both stems and words to qSig
1178 qSig
-> AttachToSuffixSig(qStem
, false);
1181 else // make the old stem into this new one
1183 pStem
-> RepairSuffixList ( Lexicon
);
1184 Lexicon
-> GetStems()-> SetKey( pStem
, PNewStem
);
1185 pStem
-> SetKey( PNewStem
);
1189 Q_ASSERT(m_StemPtrList
->size() > 0);
1190 CStem
* pStem
= m_StemPtrList
->at(m_StemPtrList
->size() - 1);
1192 //---------------------------------------------//
1193 /* Deal with the WORDs of this signature */
1194 //---------------------------------------------//
1196 for (int wordno
= 0; wordno
< m_WordPtrList
->size(); wordno
++)
1198 pWord
= m_WordPtrList
->at(wordno
);
1199 pNewSuffix
= SuffixPtrTranslation
[ pWord
->GetSuffix().Display() ];
1200 pWord
-> ShiftStemSuffixBoundary ( LetterLength
);
1202 pWord
-> SetSuffixPtr ( pNewSuffix
);
1203 pWord
-> AttachWordAndSuffixalStem ( pStem
);
1204 pWord
-> SetSuffixSignature ( this );
1208 //------------------------------------------------------------//
1210 //------------------------------------------------------------//
1211 /* Shift stems from AlternateSig to the NewSig, but NOT
1212 if the stem ends with Letter; if it does, we'll
1213 keep the old signature with that stem.
1216 This will replace some or all of pAlternateSig --
1217 "some" when there are any stems that don't allow removal of the Letter.
1218 For example, NULL.ing will not disappear when <e>ing.NULL is created,
1219 because the stem "be" still requires NULL.ing --
1222 // Deal with stems in AlternateSig....
1224 for (int stemno
= 0; stemno
< pAlternateSig
->GetNumberOfStems(); stemno
++)
1226 pStem
= pAlternateSig
->GetStem(stemno
);
1227 ssStem
= pStem
->GetKey();
1228 if ( ssStem
.Right(LetterLength
) == ssLetter
)
1231 pAlternateSig
->DetachStem( pStem
, eCall_Words
);
1232 AttachToSuffixSig( pStem
, false );
1234 // Deal with Words in Alternate signature
1236 for (int stemno
= 0; stemno
< pAlternateSig
->GetNumberOfStems(); stemno
++)
1238 pWord
= pAlternateSig
->GetStem(stemno
);
1239 pNewSuffix
= SuffixPtrTranslation
[ pWord
->GetSuffix().Display() ];
1241 pWord
-> SetSuffixPtr ( pNewSuffix
);
1242 pWord
-> AttachWordAndSuffixalStem ( pStem
);
1243 pWord
->SetSuffixSignature ( this );
1246 //------------------------------------------------------------//
1248 /* Get rid of the Alternate Sig ("NULL.ing" ) */
1250 if ( pAlternateSig
->GetNumberOfStems() == 0 )
1252 Lexicon
->GetSignatures()->Remove( pAlternateSig
);
1257 // <<------------------------------------------------------------------------>>
1258 bool CSignature::EachSuffixCanHaveThisLetterPrefixedToIt ( const QString
& Letter
)
1261 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1262 Suffix
= GetPiece(affixno
).Display();
1263 if (Suffix
== "NULL" ) { Suffix
= ""; }
1264 Suffix
= Letter
+ Suffix
;
1265 if(0)// TODO: if ( ! (*Lexicon->GetSuffixes() ^= Suffix ) )
1273 // <<------------------------------------------------------------------------>>
1274 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance
,
1275 const QString
& Piece
)
1277 struct not_implemented
{ };
1278 throw not_implemented();
1280 // XXX. suppresses “unused parameter” warnings
1281 static_cast<void>(Distance
);
1282 static_cast<void>(Piece
);
1284 foreach (CStem
* word
, *m_WordPtrList
) {
1285 word
->ShiftStemSuffixBoundary(-1);
1286 Q_ASSERT(word
->GetStemLoc() != 0);
1289 foreach (CStem
* stem
, *m_StemPtrList
) {
1290 CStringSurrogate stem_text
= stem
->GetKey();
1292 stem
->SetKey(stem_text
.Left(stem_text
.GetLength() - 1));
1294 // XXX. Check to see if the new stem already exists.
1295 // Lexicon->GetStems()->GetHash()->RemoveKey ( Stem );
1296 // Lexicon->GetStems()->GetHash()->SetAt( NewStem, pStem );
1297 // Lexicon->GetStems()->SetKey( pStem, NewStem );
1300 // XXX. fix the signature
1301 // AddLetter ( 1, Piece );
1303 // Lexicon->AddToScreen ( Display() );
1306 // Variant in which the shifted string varies from stem to stem.
1307 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance
)
1309 // XXX. suppresses “unused parameter” warning
1310 static_cast<void>(Distance
);
1311 struct not_implemented
{ };
1312 throw not_implemented();
1314 // first, fix the words;
1315 foreach (CStem
* word
, *m_WordPtrList
) {
1316 word
->ShiftStemSuffixBoundary(-1);
1317 Q_ASSERT(word
->GetStemLoc() != 0);
1320 // XXX. fix the signature
1321 // AddLetter ( 1, Piece );
1323 // Lexicon->AddToScreen ( Display() );
1326 void CSignature::AddLetter(const QString
& Letter
)
1328 PrefixToAllPieces ( CStringSurrogate(Letter
.unicode(),0,Letter
.length() ) );
1332 /// Looks at the final ngrams of the stems, and calculates its entropy
1333 double CSignature::ComputeFinalNgramEntropyOfStems(int n
)
1335 TCollection
<CLParse
> Ngrams
;
1336 foreach (CStem
* pStem
, *m_StemPtrList
) {
1337 if (pStem
->GetKeyLength() <= n
)
1341 CStringSurrogate ssPiece
= pStem
->GetKey();
1342 ssPiece
= is_initial(GetAffixLocation()) ?
1343 ssPiece
.Left(n
) : ssPiece
.Right(n
);
1347 double Entropy
= 0.0;
1348 const double StemCount
= GetNumberOfStems();
1349 const int ngram_count
= Ngrams
.GetCount();
1350 for (int i
= 0; i
< ngram_count
; ++i
) {
1351 const double fraction
= StemCount
/ Ngrams
[i
]->GetCorpusCount();
1352 Entropy
+= log2(fraction
) / fraction
;
1357 //===================================================================================================//
1359 // CHECK OUT: major function
1361 //===================================================================================================//
1362 /// Test to see whether the break with its stems is a good one.
1363 int CSignature::CheckOut(CMiniLexicon
* Lexicon
)
1365 using linguistica::implicit_cast
;
1366 // Throughout, “DL” stands for “description length”.
1367 Lexicon
->LogFileSmallTitle("Empirical signature: "+ Display() );
1368 if (Lexicon
->LogFileOn()) {
1369 // dump stem list to log file.
1370 Lexicon
->LogFileStartTable();
1371 Lexicon
->LogFileStartRow();
1372 const int num_columns
= 5;
1375 CParse Stems
= GetStems();
1376 for (int stemno
= 1; stemno
<= GetNumberOfStems(); ++stemno
) {
1377 if (stemno
% num_columns
== 0) {
1378 Lexicon
->LogFileEndRow(); Lexicon
->LogFileStartRow();
1380 Lexicon
->LogFileSimpleString( Stems
[stemno
].Display()); //JG June 2010
1382 Lexicon
->LogFileEndRow(); Lexicon
->LogFileEndTable();
1383 } // end of logfile on
1384 Lexicon
->LogFileHeader("Number of letters","Entropy", "Resolution?" );
1385 bool LowEntropyFlag
= false;
1386 int LargestSizeChunkToPullOffStem
= 0;
1387 // Use entropy to see how many letters to consider shifting
1388 // XXX. Make this user-changeable.
1389 const double EntropyThreshold
= 1.5;
1390 const int LengthToConsiderShifting
= 4;
1391 for (int n
= 1; n
<= LengthToConsiderShifting
; ++n
) {
1392 const double Entropy
= ComputeFinalNgramEntropyOfStems(n
);
1395 // Negative entropy:
1396 // stem too short to consider shortening.
1397 Lexicon
->LogFile("", "", "No reanalysis");
1401 if (Entropy
>= EntropyThreshold
) {
1402 Lexicon
->LogFileSimpleString ("");
1403 Lexicon
->LogFileSimpleDouble(Entropy
);
1404 Lexicon
->LogFileSimpleString("Entropy too large.");
1408 // set of n-suffixes of stems has low entropy:
1409 // maybe stems have a common suffix that should be
1410 // incorporated into the signature.
1411 LowEntropyFlag
= true;
1412 LargestSizeChunkToPullOffStem
= n
;
1413 Lexicon
->LogFile(n
, Entropy
, "Entropy sufficiently small.");
1414 } //end of loop on n
1415 Lexicon
->LogFileEndTable();
1416 if (!LowEntropyFlag
)
1417 // Not enough stems share common endings to restructure,
1418 // so leave this signature alone.
1421 const bool analyzingSuffixes
= !is_initial(GetAffixLocation());
1423 const double TotalNumberOfAnalyzedWords
=
1424 Lexicon
->GetSignatures()->GetTotalNumberOfWords();
1425 const double LogTotalNumberOfAnalyzedWords
=
1426 base2log(TotalNumberOfAnalyzedWords
);
1427 const double LengthOfPointerToThisSig
=
1428 LogTotalNumberOfAnalyzedWords
-
1429 base2log(Size() * GetNumberOfStems());
1431 // Description length of the original analysis
1434 // DL of this signature:
1436 // a. Length of pointers to its suffixes; var: LengthOfPointersToAllAffixesOfSig
1437 // b. Prorated responsibility for phonological content of suffixes
1438 // var: TotalResponsibilityForAffixListings
1439 // c. List of pointers from each stem to this signature
1440 // var: StemPointersToThisSig;
1441 // d. List of pointers from each word to its suffix
1443 // Compute DL of 'original' analysis.
1444 Lexicon
->LogFileSmallTitle ("Description length of current signature");
1445 Lexicon
->LogFileHeader("Affix", "Use count", "Pointer to this affix"); ;
1447 double LengthOfPointersToAllAffixesOfSig
= 0.0;
1448 double TotalResponsibilityForAffixListings
= 0.0;
1449 // for each suffix (resp. prefix) in this signature:
1450 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1451 QString Affix
= GetPiece(affixno
).Display();
1452 CAffix
* pAffix
= analyzingSuffixes
1453 ? implicit_cast
<CAffix
*>(
1454 *Lexicon
->GetSuffixes() ^= Affix
)
1455 : implicit_cast
<CAffix
*>(
1456 *Lexicon
->GetPrefixes() ^= Affix
);
1458 // Length of pointers to affixes
1460 const double LengthOfPointerToThisAffix
=
1461 LogTotalNumberOfAnalyzedWords
-
1462 base2log(pAffix
->GetUseCount());
1463 LengthOfPointersToAllAffixesOfSig
+=
1464 LengthOfPointerToThisAffix
;
1466 Lexicon
->LogFile(Affix
, pAffix
->GetUseCount(), LengthOfPointerToThisAffix
);
1468 // use count of affix; length of pointer to this affix.
1469 // Assign partial responsibility for this signature's
1470 // suffixes' entries.
1472 const double LocalProportion
=
1473 double(GetNumberOfStems()) / pAffix
->GetUseCount();
1474 const double ResponsibilityForThisAffixListing
=
1475 LocalProportion
* Affix
.length() * base2log(26);
1476 TotalResponsibilityForAffixListings
+=
1477 ResponsibilityForThisAffixListing
; // in *bits*
1478 }// end of affixno loop
1480 Lexicon
->LogFileEndTable();
1481 Lexicon
->LogFileStartTable();
1482 Lexicon
->LogFile("Part 1: Length of pointer to affixes", LengthOfPointersToAllAffixesOfSig
);
1483 Lexicon
->LogFile("Part 2: Prorated responsibility for phonology of affixes:", TotalResponsibilityForAffixListings
);
1486 const double StemPointersToThisSig
=
1487 GetNumberOfStems() * LengthOfPointerToThisSig
;
1490 const double total_dl
=
1491 LengthOfPointersToAllAffixesOfSig
+
1492 TotalResponsibilityForAffixListings
+
1493 StemPointersToThisSig
;
1494 Lexicon
->LogFile("Part 3: Stem poionters to this sig:", StemPointersToThisSig
);
1495 Lexicon
->LogFile("Length of 1 pointer to this sig: ", LengthOfPointerToThisSig
);
1496 Lexicon
->LogFile("Total", total_dl
);
1497 Lexicon
->LogFileEndTable();
1498 CurrentDL
= total_dl
;
1500 double WinningDL
= CurrentDL
;
1501 int WinningLengthOfStemToShift
= 0;
1503 // We might shift only those stems for which the EndPiece
1504 // occurs in more than 45% of the stems of this sig (that
1505 // leaves open the case of two closely related letters
1506 // comprising almost all of the cases).
1507 // But for now, we're not doing that.
1509 // The outer loop here is for the case where the entropy test
1510 // tells us that 2 or more letters can be shifted
1511 // (e.g., sig on.ve can be shifted either to ion.ive or
1512 // tion.tive), and we want to evaluate both.
1514 // Major loop through alternatives to the current signature
1516 // loop through different lengths to shift:
1517 for (int NumberOfLettersShifted
= LargestSizeChunkToPullOffStem
;
1518 NumberOfLettersShifted
> 0;
1519 --NumberOfLettersShifted
) {
1521 TCollection
<CLParse
> EndPieces
;
1522 foreach (CStem
* pStem
, *m_StemPtrList
) {
1523 if (pStem
->GetKeyLength() <= NumberOfLettersShifted
)
1526 CStringSurrogate stem_text
= pStem
->GetKey();
1527 CStringSurrogate ssPiece
= analyzingSuffixes
1528 ? stem_text
.Right(NumberOfLettersShifted
)
1529 : stem_text
.Left(NumberOfLettersShifted
);
1530 EndPieces
<< ssPiece
;
1533 // XXX. The function is supple enough to move material
1534 // from the stem to the affix in some cases but not in others.
1536 double AllNewSigsAnalysisDL
= 0.0;
1537 double TotalDecreaseInDLDueToShorterStems
= 0.0;
1538 // each of these is a distinct piece being, perhaps,
1539 // transferred from stem(s) to affixes
1540 // for each string of this length that would have to be shifted:
1542 for (int pieceno
= 0; pieceno
< EndPieces
.GetCount(); ++pieceno
) {
1543 CLParse
* pPiece
= EndPieces
.GetAt(pieceno
);
1545 // make a copy to play with.
1548 if (analyzingSuffixes
)
1549 Sig
.PrefixToAllPieces2(pPiece
->GetKey());
1551 Sig
.SuffixToAllPieces2(pPiece
->GetKey());
1553 // DL of this signature:
1555 // a. Length of pointers to its suffixes;
1556 // var: LengthOfPointersToAllAffixesOfSig
1557 // b. Prorated responsibility for phonological
1558 // content of suffixes
1559 // var: TotalResponsibilityForAffixListings
1560 // c. List of pointers from each stem to this
1562 // var: PointersToThisSig;
1563 // d. Savings because stems already existed
1564 // var: SavingsBecauseStemAlreadyExisted
1565 // e. Savings because stems are shorter
1566 // var: TotalDecreaseInDLDueToShorterStems :
1567 // once for each *length* being shifted from
1569 // f. List of pointers from each word to its
1571 // XXX. not implemented.
1573 double LengthOfPointersToAllAffixesOfSig
= 0.0;
1574 double TotalResponsibilityForAffixListings
= 0.0;
1575 if (*Lexicon
->GetSignatures() ^= Sig
) {
1576 // new signature already exists
1577 Lexicon
->LogFileSmallTitle("Alternative analysis already existed", Sig
.Display('-'));
1578 // XXX. address this case!
1581 Lexicon
->LogFileSmallTitle("Conjectured signature: " + Sig
.Display('-'));
1582 Lexicon
->LogFileStartTable();
1583 // iterate through suffixes of the signature
1584 Lexicon
->LogFileHeader("Suffix", "Previous count", "New count", "Pointer length to this affix", "Responsibility for this affix (phonology) in bits:", "New DL for this affix");
1585 double ThisNewSigDL
= 0.0;
1586 // for each suffix (resp prefix) in the new sig:
1587 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1588 CStringSurrogate ssAffix
=
1589 Sig
.GetPiece(affixno
);
1591 CAffix
* pAffix
= analyzingSuffixes
1592 ? implicit_cast
<CAffix
*>(
1593 *Lexicon
->GetSuffixes() ^= ssAffix
)
1594 : implicit_cast
<CAffix
*>(
1595 *Lexicon
->GetPrefixes() ^= ssAffix
);
1598 const double ResponsibilityForThisAffixListing
=
1599 double(ssAffix
.GetLength()) * base2log(26) *
1600 GetNumberOfStems() /
1601 (double(GetNumberOfStems()) +
1602 pAffix
->GetUseCount());
1603 const double LengthOfPointerToThisAffix
=
1604 LogTotalNumberOfAnalyzedWords
-
1605 base2log(pAffix
->GetUseCount() +
1606 GetNumberOfStems());
1608 TotalResponsibilityForAffixListings
+=
1609 ResponsibilityForThisAffixListing
;
1610 LengthOfPointersToAllAffixesOfSig
+=
1611 LengthOfPointerToThisAffix
;
1613 sum
= ResponsibilityForThisAffixListing
+
1614 LengthOfPointerToThisAffix
;
1615 Lexicon
->LogFile (ssAffix
.Display(), pAffix
->GetUseCount(), GetNumberOfStems() + pAffix
->GetUseCount(), LengthOfPointerToThisAffix
, ResponsibilityForThisAffixListing
, sum
);
1619 const double ResponsibilityForThisAffixListing
=
1620 double(ssAffix
.GetLength()) * base2log(26);
1621 const double LengthOfPointerToThisAffix
=
1622 LogTotalNumberOfAnalyzedWords
-
1623 base2log(GetNumberOfStems());
1625 LengthOfPointersToAllAffixesOfSig
+=
1626 LengthOfPointerToThisAffix
;
1627 TotalResponsibilityForAffixListings
+=
1628 ResponsibilityForThisAffixListing
;
1629 sum
= ResponsibilityForThisAffixListing
+
1630 LengthOfPointerToThisAffix
;
1631 Lexicon
->LogFile(ssAffix
.Display(), 0, GetNumberOfStems(), LengthOfPointerToThisAffix
, ResponsibilityForThisAffixListing
, sum
);
1633 ThisNewSigDL
+= sum
;
1635 Lexicon
->LogFile("Total", 0, 0, LengthOfPointersToAllAffixesOfSig
, TotalResponsibilityForAffixListings
, ThisNewSigDL
);
1636 Lexicon
->LogFileEndTable();
1639 // Length of the pointers to the sig from its stems:
1640 double SavingsBecauseStemAlreadyExisted
= 0.0;
1641 double StemPointersToThisSig
;
1642 IterateThroughStems(NumberOfLettersShifted
,
1645 TotalDecreaseInDLDueToShorterStems
,
1646 LogTotalNumberOfAnalyzedWords
,
1647 StemPointersToThisSig
,
1648 SavingsBecauseStemAlreadyExisted
,
1650 const double ThisNewSigDL
=
1651 LengthOfPointersToAllAffixesOfSig
+
1652 TotalResponsibilityForAffixListings
+
1653 StemPointersToThisSig
+
1654 -SavingsBecauseStemAlreadyExisted
+
1655 -TotalDecreaseInDLDueToShorterStems
;
1656 AllNewSigsAnalysisDL
+= ThisNewSigDL
;
1657 Lexicon
->LogFile("Part 1: Length of pointer to affixes: ", LengthOfPointersToAllAffixesOfSig
);
1658 Lexicon
->LogFile("Part 2: Prorated responsibility for phonology of affixes: ", TotalResponsibilityForAffixListings
);
1659 Lexicon
->LogFile("Part 3: Stem pointers to this sig:", StemPointersToThisSig
);
1660 Lexicon
->LogFile("Length of 1 poitner to this sig: ", LengthOfPointerToThisSig
);
1661 Lexicon
->LogFile("Part 4: Total savings from stems that had already existed", SavingsBecauseStemAlreadyExisted
);
1662 Lexicon
->LogFile("Part 5: Total decrease in DL due to shorter stems: ", TotalDecreaseInDLDueToShorterStems
);
1663 Lexicon
->LogFile("Total DL: ", ThisNewSigDL
);
1665 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1667 QString("If we add %1 letters, total TD is %2")
1668 .arg(NumberOfLettersShifted
).arg(AllNewSigsAnalysisDL
) <<
1669 endl
<< "******" << endl
<<
1672 if (AllNewSigsAnalysisDL
< WinningDL
) {
1673 WinningDL
= AllNewSigsAnalysisDL
;
1674 WinningLengthOfStemToShift
= NumberOfLettersShifted
;
1679 if (WinningDL
!= CurrentDL
) {
1680 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1682 "Change signature from \"%1\" to \"%2\"")
1683 .arg(Display(), WinningSig
.Display('.'))) <<
1685 Lexicon
->AddToScreen(
1687 .arg(Display('.'), WinningSig
.Display('.')));
1688 return WinningLengthOfStemToShift
;
1690 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1692 "%1: Conclusion: Keep original signature.")
1698 // <<-------------------------------------------------------------------------------------------------------->>
1699 void CSignature::IterateThroughStems( int NumberOfLettersShifted
,
1700 CMiniLexicon
* Lexicon
,
1702 double& TotalDecreaseInDLDueToShorterStems
,
1703 double LogTotalNumberOfAnalyzedWords
,
1704 double& StemPointersToThisSig
,
1705 double& SavingsBecauseStemAlreadyExisted
,
1706 bool analyzingSuffixes
)
1712 int HowManyStemsForThisSig
= 0; //check that
1713 int NumberOfShortenedStemsThatPreExisted
= 0;
1714 double ThisSavingBecauseStemAlreadyExisted
= 0;
1715 double DecreaseInDLDueToShorterStems
= 0;
1716 double LengthOfPointerToThisSig
= 0;
1719 TotalDecreaseInDLDueToShorterStems
= 0;
1720 SavingsBecauseStemAlreadyExisted
= 0;
1722 Lexicon
->LogFile (pPiece
->Display() );
1723 Lexicon
->LogFileHeader( "Current stem", "Proposed stem", "Savings from preexisting stem");
1726 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
1728 pStem
= m_StemPtrList
->at(stemno
);
1729 ThisSavingBecauseStemAlreadyExisted
=0;
1730 int StemLength
= pStem
->GetKeyLength();
1731 ssNewStem
= pStem
->GetKey().Left(
1732 StemLength
- NumberOfLettersShifted
);
1733 Lexicon
->LogFileStartRow();
1734 if ( analyzingSuffixes
) // Suffixes
1736 if ( pStem
->GetKey().Right(NumberOfLettersShifted
).Display() == pPiece
->Display() )
1738 HowManyStemsForThisSig
++;
1739 Lexicon
->LogFile1SimpleString(pStem
->Display());
1740 Lexicon
->LogFile1SimpleString(ssNewStem
.Display());
1745 Lexicon
->LogFile1SimpleString(pStem
->Display());
1746 Lexicon
->LogFile1SimpleString(ssNewStem
.Display());
1749 ssNewStem
= pStem
->GetKey().Left( pStem
->GetKeyLength() - NumberOfLettersShifted
);
1753 if ( pStem
->GetKey().Left(NumberOfLettersShifted
).Display() == pPiece
->Display() )
1755 HowManyStemsForThisSig
++;
1761 ssNewStem
= pStem
->GetKey().Right( pStem
->GetKeyLength() - NumberOfLettersShifted
);
1766 if ( Lexicon
->GetStems()->Contains( ssNewStem
) || // ** Was: "GetStems_Suffixed
1767 Lexicon
->GetWords()->Contains( ssNewStem
) )
1769 NumberOfShortenedStemsThatPreExisted
++;
1770 ThisSavingBecauseStemAlreadyExisted
= ssNewStem
.GetLength() * base2log (26);
1771 SavingsBecauseStemAlreadyExisted
+= ThisSavingBecauseStemAlreadyExisted
;
1773 // ** Add the cost of having a pointer to the stem ******
1777 if ( Lexicon
->LogFileOn() &&
1778 ( pStem
->GetKey().Right(NumberOfLettersShifted
).Display() == pPiece
->Display() ) )
1781 if ( ThisSavingBecauseStemAlreadyExisted
> 0)
1783 Lexicon
->LogFileSimpleString("ThisSavingBecauseStemAlreadyExisted");
1786 Lexicon
->LogFileSimpleString("none (did not exist)");
1789 Lexicon
->LogFileEndRow();
1792 DecreaseInDLDueToShorterStems
= ( HowManyStemsForThisSig
- NumberOfShortenedStemsThatPreExisted
) *
1793 NumberOfLettersShifted
* base2log (26);
1794 TotalDecreaseInDLDueToShorterStems
+= DecreaseInDLDueToShorterStems
;
1797 LengthOfPointerToThisSig
= LogTotalNumberOfAnalyzedWords
- base2log ( Size() * HowManyStemsForThisSig
) ;
1798 StemPointersToThisSig
= HowManyStemsForThisSig
* ( LengthOfPointerToThisSig
) ;
1799 if ( Lexicon
-> LogFileOn() )
1801 *Lexicon
->GetLogFile() << // FILL THIS IN --
1805 MakeTableHeader("Current stem") <<
1806 MakeTableHeader("Proposed stem") <<
1807 MakeTableHeader("Savings from preexisting stem") <<
1814 bool CSignature::IsValid()
1815 // tests that pieces of the signature are all non-null
1816 { for (int affixno
= 1; affixno
<= m_PieceCount
; affixno
++) {
1817 if ( GetPiece(affixno
).GetLength() < 1 ) {
1823 // <<-------------------------------------------------------------------------------------------------------->>
1824 void CSignature::DetachStem(CStem
* pStem
, detachment_parameter Parameter
)
1826 if( !m_StemPtrList
->isEmpty() &&
1827 m_StemPtrList
->indexOf( pStem
) >= 0 &&
1828 m_StemPtrList
->remove( pStem
) )
1830 IncrementCorpusCount( -1 * pStem
->GetCorpusCount() );
1832 if (Parameter
!= eDo_Not_Call_Words
) {
1834 for (int wordno
= 0; wordno
< pStem
->GetNumberOfWords(); wordno
++) {
1835 pWord
= pStem
->GetWordPtrList()->at(wordno
);
1836 const int index
= m_WordPtrList
->indexOf(pWord
);
1838 m_WordPtrList
->removeAt(index
);
1843 // <<-------------------------------------------------------------------------------------------------------->>
1844 void CSignature::DetachWord(CStem
* pWord
, enum detachment_parameter param
)
1846 struct not_implemented
{ };
1847 throw not_implemented();
1849 // Suppress warnings.
1850 static_cast<void>(pWord
);
1851 static_cast<void>(param
);
1853 // <<-------------------------------------------------------------------------------------------------------->>
1854 void CSignature::TakeAllStems(CSignature
* source
)
1856 //QList<CStem*>& source_stems = *source->GetStemPtrList();
1858 for (int stemno
= 0; stemno
< source
->GetNumberOfStems(); stemno
++)
1860 pStem
=source
->GetStem(stemno
);;
1861 pStem
->SetSuffixList(this);
1862 AppendStemPtr(pStem
);
1863 IncrementCorpusCount(pStem
->GetCorpusCount());
1865 // Remove items from source.
1866 //Q_ASSERT(!source_stems.autoDelete());
1867 //source_stems.clear();
1868 source
->ClearStemPtrList();
1870 // XXX. Decrement source corpus count in turn?
1871 // Hard to tell, since there are no call sites.
1873 // <<-------------------------------------------------------------------------------------------------------->>
1874 void CSignature::AddWord (CStem
* pWord
)
1876 m_WordPtrList
->append (pWord
);
1877 IncrementCorpusCount (pWord
->GetCorpusCount() );
1880 void CSignature::ClearStemPtrList() { m_StemPtrList
->clear(); }
1881 void CSignature::AppendWordPointer(CStem
* pWord
) { m_WordPtrList
->append(pWord
); }
1882 void CSignature::AppendPrefixPtr(CPrefix
* pPrefix
) { m_PrefixPtrList
->append (pPrefix
);}
1883 int CSignature::GetNumberOfWords() const
1885 return m_WordPtrList
->count();
1888 // <<-------------------------------------------------------------------------------------------------------->>
1889 CParse
CSignature::CreateADeletingSignature( CParse
& Deletee
, CMiniLexicon
* Lexicon
)
1891 CStringSurrogate ssSuffix
;
1898 QString Null
= "NULL", lt_brak
= "<", rt_brak
= "<";
1901 Q_ASSERT (Deletee
.Size() == 1);
1903 for (int affixno
= 1; affixno
<= Size(); affixno
++)
1905 ssSuffix
= GetPiece(affixno
);
1906 if(NewSig
.GetSortStyle() != eAlphabetized
) NewSig
.Alphabetize();
1907 if ( ssSuffix
== Deletee
)
1909 NewSig
.Append ( CStringSurrogate(Null
.unicode(),0,Null
.length() ) );
1913 PSuffix
= CStringSurrogate(lt_brak
.unicode(),0,1);
1915 PSuffix
+= CStringSurrogate(rt_brak
.unicode(),0,1);
1916 PSuffix
.ClearParseStructure();
1917 PSuffix
+= ssSuffix
;
1918 NewSig
.Append ( PSuffix
.GetKey() );
1920 pSuffix
= *Lexicon
->GetSuffixes() << PSuffix
;
1922 QString line
= "<" + Deletee
.Display() + ">" + ssSuffix
.Display();
1923 Suffix
= CStringSurrogate( line
.unicode(),0,line
.length());
1925 NewSig
.Append (Suffix
.GetKey());
1926 // Lexicon->SetSuffixTranslation(this, ssSuffix, Suffix);
1933 // <<-------------------------------------------------------------------------------------------------------->>
1934 bool CSignature::RemoveStem(CStem
* pStem
)
1936 return m_StemPtrList
->remove( pStem
);
1938 // <<-------------------------------------------------------------------------------------------------------->>
1941 bool CSignature::RemoveWord(CStem
* pWord
)
1943 return m_WordPtrList
->remove( pWord
);
1945 // <<-------------------------------------------------------------------------------------------------------->>
1946 // copy out affixes, with null affix replaced with "NULL",
1947 // possibly with deletees marked with angle brackets
1948 CSignature
& CSignature::Express(CSignature
& Output
, bool bDisplayDeletees
)
1950 CSuffixCollection
* Suffixes
= 0;
1951 CPrefixCollection
* Prefixes
= 0;
1952 if (!is_initial(GetAffixLocation()))
1953 Suffixes
= GetSignatureCollection()->GetMySuffixes();
1955 Prefixes
= GetSignatureCollection()->GetMyPrefixes();
1957 Output
.ClearParse();
1959 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1960 CStringSurrogate affix_text
= GetPiece(affixno
);
1962 if (affix_text
.IsNULL()) {
1963 Output
.Append(TheStringNULL
);
1966 if (!is_initial(m_AffixLocation
)) {
1967 CSuffix
* suffix
= *Suffixes
^= affix_text
;
1968 Q_ASSERT(suffix
!= 0);
1972 suffix
->Express(Temp
, bDisplayDeletees
));
1974 CPrefix
* prefix
= *Prefixes
^= affix_text
;
1975 Q_ASSERT(prefix
!= 0);
1979 prefix
->Express(Temp
, bDisplayDeletees
));
1984 // <<-------------------------------------------------------------------------------------------------------->>
1985 /// concatenate affixes, separated by -.
1986 QString
CSignature::Express(bool bDisplayDeletees
)
1988 CSuffixCollection
* Suffixes
= 0;
1989 CPrefixCollection
* Prefixes
= 0;
1990 if (!is_initial(GetAffixLocation()))
1991 Suffixes
= GetSignatureCollection()->GetMySuffixes();
1993 Prefixes
= GetSignatureCollection()->GetMyPrefixes();
1996 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1997 CStringSurrogate affix_text
= GetPiece(affixno
);
1999 if (affix_text
.IsNULL()) {
2000 if (!Outstring
.isEmpty())
2001 Outstring
.append('-');
2002 Outstring
.append(TheStringNULL
);
2006 if (is_initial(m_AffixLocation
)) {
2007 CPrefix
* prefix
= *Prefixes
^= affix_text
;
2008 Q_ASSERT(prefix
!= 0);
2009 if (!Outstring
.isEmpty())
2010 Outstring
.append('-');
2013 Outstring
.append(prefix
->Express(Temp
,
2014 bDisplayDeletees
).Display());
2016 CSuffix
* suffix
= *Suffixes
^= affix_text
;
2017 Q_ASSERT(suffix
!= 0);
2018 if (!Outstring
.isEmpty())
2019 Outstring
.append('-');
2022 Outstring
.append(suffix
->Express(Temp
,
2023 bDisplayDeletees
).Display());
2029 // <<-------------------------------------------------------------------------------------------------------->>
2031 // this should probably be replaced by ComputeDLofModel
2033 double CSignature::ComputeDL( int char_count )
2038 CStringSurrogate Affix;
2040 bool CORPUS_BASED_AFFIX_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedAffixCount", 0 );
2041 bool CORPUS_BASED_STEM_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedStemCount", 1 );
2043 double stems_dl = 0.0,
2046 uint stem_total = 0,
2049 if( CORPUS_BASED_STEM_COUNT )
2051 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2053 stems_dl += ( (double) -1 ) * base2log( (double) pStem->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2058 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2060 stems_dl = ( (double) -1 ) * base2log( (double) pStem->GetWordPtrList()->count() / (double) m_pMyMini->GetWords()->GetCount() );
2064 bool analyzedSuffixes = TRUE;
2065 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2068 if( !CORPUS_BASED_AFFIX_COUNT )
2070 for( i = 1; i <= m_PieceCount; i++ )
2072 Affix = GetPiece(i);
2074 if( analyzedSuffixes )
2076 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2080 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2083 if( pAffix ) affix_total += pAffix->GetCorpusCount();
2087 for( i = 1; i <= m_PieceCount; i++ )
2089 Affix = GetPiece(i);
2091 if( analyzedSuffixes )
2093 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2097 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2102 if( CORPUS_BASED_AFFIX_COUNT ) affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2103 else affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) affix_total );
2107 return stems_dl + affixes_dl;
2110 // <<-------------------------------------------------------------------------------------------------------->>
2111 //====================================================================//
2112 // Description Length //
2113 //====================================================================//
// Returns the member m_DLofMyAffixPointers, accumulating it first when it
// still compares equal to 0.
// The accumulation adds GetLengthOfPointerToMe() of every entry of
// GetSuffixPtrList() when analyzedSuffixes is set; analyzedSuffixes is
// cleared when GetAffixLocation() is STEM_INITIAL or WORD_INITIAL.
// The loop over GetPrefixPtrList() is presumably the else branch (the
// `else` line is not visible in this listing).
// NOTE(review): 0 doubles as the "not computed yet" marker, so a non-zero
// cached value is presumably never refreshed here -- confirm with callers.
2114 double CSignature::GetDLofMyAffixPointers( )
2116 if (m_DLofMyAffixPointers
== 0)
2118 bool analyzedSuffixes
= TRUE
;
// Prefix-type affix locations switch the flag off.
2121 if( GetAffixLocation() == STEM_INITIAL
|| GetAffixLocation() == WORD_INITIAL
) analyzedSuffixes
= FALSE
;
2122 if (analyzedSuffixes
)
// Sum the pointer lengths of all suffixes of this signature.
2124 for (int suffixno
= 0; suffixno
< GetSuffixPtrList()->size(); suffixno
++)
2125 { pSuffix
= GetSuffixPtrList()->at(suffixno
);
2126 m_DLofMyAffixPointers
+= pSuffix
->GetLengthOfPointerToMe ();
// Sum the pointer lengths of all prefixes of this signature.
2131 for (int prefixno
= 0; prefixno
< GetPrefixPtrList()->size(); prefixno
++)
2133 pPrefix
= GetPrefixPtrList()->at(prefixno
);
2134 m_DLofMyAffixPointers
+= pPrefix
->GetLengthOfPointerToMe ();
2138 return m_DLofMyAffixPointers
;
2140 // <<-------------------------------------------------------------------------------------------------------->>
// Returns the member m_DLofMyStemPointers, accumulating it first when it
// still compares equal to 0.
// The accumulation adds GetLengthOfPointerToMe() of GetStem(stemno) for
// stemno in [0, GetNumberOfStems()).
// NOTE(review): 0 doubles as the "not computed yet" marker, so a non-zero
// cached value is presumably never refreshed here -- confirm with callers.
2141 double CSignature::GetDLofMyStemPointers()
2143 if (m_DLofMyStemPointers
== 0)
// Sum the pointer lengths of all stems of this signature.
2146 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
2148 pStem
= GetStem(stemno
);
2149 m_DLofMyStemPointers
+= pStem
->GetLengthOfPointerToMe ();
2152 return m_DLofMyStemPointers
;
2154 // <<-------------------------------------------------------------------------------------------------------->>
// Returns GetDLofMyStemPointers() + GetDLofMyAffixPointers().
// Both results are also written to the corresponding members
// m_DLofMyStemPointers and m_DLofMyAffixPointers.
// The int parameter is unnamed and not used by the body.
2155 double CSignature::ComputeDLofModel(int /* char_count, not used */)
2157 // XXX. take SignatureDL\CorpusBased{Stem,Affix}Count parameters
2160 m_DLofMyStemPointers
= GetDLofMyStemPointers();
2161 m_DLofMyAffixPointers
= GetDLofMyAffixPointers();
2162 return m_DLofMyStemPointers
+ m_DLofMyAffixPointers
;
2164 // <<-------------------------------------------------------------------------------------------------------->>
// Recomputes m_DLofMyCorpus from scratch and returns it.
// For every word in *m_WordPtrList the function looks up, in m_pMyMini:
//   - the stem, keyed by pWord->GetStem();
//   - the affix, keyed by the word's prefix (when is_initial(m_AffixLocation))
//     or suffix, with an empty affix replaced by TheStringNULL;
//   - the word entry itself in GetWords().
// It then adds word->GetCorpusCount() times the sum of the stem's and the
// affix's GetLengthOfPointerToMe() to the running total.
// NOTE(review): `affix` and `word` are dereferenced with no null check
// visible in this listing; the handling after the NULL-stem diagnostics is
// not visible either -- confirm against the full file.
2165 double CSignature::ComputeDLofMyCorpus()
2167 using linguistica::implicit_cast
;
// Start from zero; the total is rebuilt on every call.
2172 m_DLofMyCorpus
= 0.0;
2173 foreach (CStem
* pWord
, *m_WordPtrList
) {
2174 CStringSurrogate stem_text
= pWord
->GetStem();
2175 CStem
* stem
= *m_pMyMini
->GetStems() ^= stem_text
;
// Diagnostic dump of the word, stem text and affix; the message indicates
// the NULL-stem case (the guarding condition is not visible here).
2180 std::cout
<< "NULL stem -- in CSignature::ComputeDLofMyCorpus() "<< std::endl
;
2181 std::cout
<< " word: "<<pWord
->Display().toStdString()<< std::endl
;
2182 std::cout
<< " stem: "<< stem_text
.Display().toStdString()<<std::endl
;
2183 CStringSurrogate afx_str
2184 = (is_initial(m_AffixLocation
) ? pWord
->GetPrefix() : pWord
->GetSuffix());
2185 std::cout
<< " affix:"<< afx_str
.Display().toStdString() << std::endl
;
2186 std::cout
<< std::endl
;
// Affix text of this word: prefix for initial affix locations, suffix
// otherwise; an empty affix is represented by TheStringNULL.
2191 CStringSurrogate affix_text
= is_initial(m_AffixLocation
)
2192 ? pWord
->GetPrefix()
2193 : pWord
->GetSuffix();
2194 if (affix_text
.GetLength() == 0)
2195 affix_text
= TheStringNULL
;
// Look the affix up in the matching collection of the mini-lexicon.
2197 CAffix
* affix
= is_initial(m_AffixLocation
)
2198 ? implicit_cast
<CAffix
*>(
2199 *m_pMyMini
->GetPrefixes() ^= affix_text
)
2200 : implicit_cast
<CAffix
*>(
2201 *m_pMyMini
->GetSuffixes() ^= affix_text
);
2203 CStem
* word
= *m_pMyMini
->GetWords() ^= pWord
;
// Cost of one occurrence = stem pointer length + affix pointer length;
// it is weighted by the word's corpus count below.
2204 const double ThisWordDL
=
2205 stem
->GetLengthOfPointerToMe() +
2206 affix
->GetLengthOfPointerToMe();
2207 m_DLofMyCorpus
+= word
->GetCorpusCount() * ThisWordDL
;
2209 return m_DLofMyCorpus
;
2211 // <<-------------------------------------------------------------------------------------------------------->>
2214 /// Get the corpus counts of each suffix with this stem
2215 int* GetSuffixCounts(CStem* stem, int* output)
2217 if (output) delete output; // error if this occurs.
2218 output = new int[ stem->GetNumberOfSuffixes() ];
2220 for (int i = 1; i <= stem->GetSuffixList()->Size(); ++i) {
2221 QString Suffix = stem->GetSuffixList()->GetPiece(i).Display();
2222 if (Suffix == "NULL")
2224 QString Word = stem->Display() + Suffix;
2225 CStem* pWord = *stem->GetMyMini()->GetWords() ^=
2226 CStringSurrogate(Word);
2228 output[i-1] = pWord->GetCorpusCount();
2234 //the output is a vector of integers, whose length is
2235 // the number of stems times the number of suffixes. Pass it
2236 // an int pointer that points to NULL; it will delete the memory
2237 // that this function creates.
2238 int* CSignature::GetIndividualCountsForEachStem (int* output )
2240 int affixno, stemno;
2244 if (output) delete output; //if this occurs, it's an error.
2245 output = new int [GetNumberOfStems() * GetNumberOfAffixes() ];
2247 CMiniLexicon* pMiniLexicon = GetLexicon();
2248 NOT FINISHED YET _--- use GETaWord -- JG
2249 for (stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2251 pSt em = m_StemPtrList->at(stemno);
2252 temp = GetSuffixCounts(pStem, temp);
2253 for (affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
2255 output[stemno * GetNumberOfAffixes() + affixno] = temp[affixno];
2264 //===================================================================================================//
2266 // Description length
2268 //===================================================================================================//
// Returns StemTotal + SuffixTotal, where
//   - StemTotal sums GetLengthOfPointerToMe_2() over every entry of
//     m_StemPtrList, and
//   - SuffixTotal sums GetLengthOfPointerToMe() over the affixes found by
//     looking up GetPiece(affixno), affixno in [1, GetNumberOfAffixes()],
//     in m_pMyMini->GetSuffixes().
// NOTE(review): only the suffix collection is consulted, whatever the affix
// location of this signature is, and the lookup result is dereferenced with
// no null check visible in this listing -- confirm this is intended.
2269 double CSignature::GetSumOfDLofInternalPointers()
2272 double StemTotal
= 0, SuffixTotal
= 0;
2276 CSuffixCollection
& Suffixes
= *m_pMyMini
->GetSuffixes();
// Stems are indexed from 0.
2277 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
2279 pStem
= m_StemPtrList
->at(stemno
);
2280 StemTotal
+= pStem
->GetLengthOfPointerToMe_2 ();
// Pieces of the signature are indexed from 1.
2283 for (int affixno
= 1; affixno
<= GetNumberOfAffixes(); affixno
++)
2285 ssSuffix
= GetPiece(affixno
);
2286 pSuffix
= Suffixes
^= ssSuffix
;
2287 SuffixTotal
+= pSuffix
->GetLengthOfPointerToMe();
2289 return StemTotal
+ SuffixTotal
;
2291 // <<-------------------------------------------------------------------------------------------------------->>
// Stores L in the member m_LengthOfPointerToMe.
2293 void CSignature::SetLengthOfPointerToMe(double L
)
2295 m_LengthOfPointerToMe
= L
;
2299 // <<-------------------------------------------------------------------------------------------------------->>
// Appends the given parse to the member m_SatelliteAffixes.
2301 void CSignature::AppendSatelliteAffix(CParse
& suffix
)
2304 m_SatelliteAffixes
.Append(suffix
);
2307 //===================================================================================================//
2311 //===================================================================================================//
// Intended to decide whether one of the two signatures (this and pSig)
// generalizes the other by aligning their affixes; the numbered plan and the
// example tables in the comments below describe the intended alignment.
// NOTE(review): the body throws a locally declared not_implemented object
// before any of the alignment code, and no guard around the throw is visible
// in this listing, so everything after it looks unreachable -- confirm
// before relying on this method.
// NOTE(review): several lines of this function (declarations of `match` and
// `ThisRow`, loop increments, branch bodies) are not visible in this
// listing; the comments added here describe only what is shown.
2312 bool CSignature::Generalizes(CSignature
* pSig
)
2314 struct not_implemented
{ };
2315 throw not_implemented();
2317 // 1. Check they have the same length; find which one is longer.
2318 // 2. Go from longest to shortest pieces of the longer signature:
2319 // look for unambiguous correspondents in the other signature, and
2320 // put those pairs of corresponding affixes in some structure.
2321 // 3. After unambiguous cases, deal with ambiguous cases, if any exist.
2322 // 4. Find alignment
2324 // ed |NULL | NULL | ed |
2325 // ing|NULL | NULL | ing |
2326 // es |e | NULL | s |
2327 // e |e | NULL | NULL|
2330 // ed |e | <e> | ed |
2331 // ing|e | <e> | ing |
2332 // es |e | NULL | s |
2333 // e |e | NULL | NULL|
2336 // ien |ien | NULL | NULL |
2337 // ienne |ienn | NULL | e |
2338 // iens |ien | NULL | s |
2339 // iennes |ienn | NULL | es |
2341 // ien |ien | NULL | NULL |
2342 // ienne |ien | n | e |
2343 // iens |ien | NULL | s |
2344 // iennes |ien | n | es |
2346 CSignature
* LongerSig
, *ShorterSig
;
// Step 1 of the plan: compare the number of pieces, then use the difference
// in key length to decide which signature plays the "longer" role.
2355 if (Size() != pSig
->Size())
2358 const int dif
= GetKeyLength() - pSig
->GetKeyLength();
2360 LongerSig
= this; ShorterSig
= pSig
;
2361 } else if (dif
== 0) {
2364 LongerSig
= pSig
; ShorterSig
= this;
2367 const int MAXAFFIXSIZE
= 10;
2369 QStringList ShorterSigPieces
;
2372 // Copy the affixes of ShorterSig,
2373 // from shortest to longest
2374 // onto the list ShorterSigPieces.
2375 if (ShorterSig
->ContainsNULL())
2376 ShorterSigPieces
.append(TheStringNULL
);
2377 for (int m
= 1; m
< MAXAFFIXSIZE
&&
2378 ShorterSigPieces
.count() < ShorterSig
->Size();
2380 // XXX. this test makes no sense
2381 if (ShorterSig
->ThisPieceLength(m
) == m
)
2382 ShorterSigPieces
.prepend(
2383 ShorterSig
->GetPiece(m
).Display());
2385 Q_ASSERT(ShorterSigPieces
.count() == ShorterSig
->Size());
2388 QStringList LongerSigPieces
;
2390 // Copy the affixes of LongerSig,
2391 // from shortest to longest
2392 // onto the list LongerSigPieces.
2393 if (LongerSig
->ContainsNULL())
2394 LongerSigPieces
.append(TheStringNULL
);
2395 for (int m
= 1; m
< MAXAFFIXSIZE
&&
2396 LongerSigPieces
.count() < LongerSig
->Size();
2398 if (LongerSig
->ThisPieceLength(m
) == m
)
2399 LongerSigPieces
.prepend(
2400 LongerSig
->GetPiece(m
).Display());
2401 Q_ASSERT(LongerSigPieces
.count() == LongerSig
->Size());
2404 CStringSurrogate ssIng
, ssTing
;
// Pair every affix of the shorter signature with the affixes of the longer
// one; a pair is compared via the right-hand end of the longer affix.
2405 foreach (QString shortersig_piece
, ShorterSigPieces
) {
2407 CStringSurrogate
short_affix(shortersig_piece
);
2409 foreach (QString longersig_piece
, LongerSigPieces
) {
2411 CStringSurrogate
long_affix(longersig_piece
);
2412 if (long_affix
.IsNULL())
2414 if (short_affix
!= long_affix
.Right(
2415 short_affix
.GetLength()))
2417 bool unambiguous_match
= !match
;
2421 if (!unambiguous_match
)
2426 long_affix
.Display();
2427 ThisRow
.ShortAffix
=
2428 short_affix
.Display();
// Extension = the part of the longer affix left of the shared ending.
2429 ThisRow
.Extension
= long_affix
.Left(
2430 long_affix
.GetLength() -
2431 short_affix
.GetLength())
2433 // XXX. use ThisRow...
2434 static_cast<void>(ThisRow
);
2439 // <<-------------------------------------------------------------------------------------------------------->>
2440 // <<-------------------------------------------------------------------------------------------------------->>
// For every stem of this signature and every piece (affix) of the signature,
// looks up the corresponding word in GetLexicon()->GetWords() and cuts it
// with CutRightBeforeHere() at (word key length - stem key length).
//   - When is_initial(GetAffixLocation()) holds, the word is prefix + stem
//     key, and after the cut SetStemLoc(2) and SetPrefixLoc(1) are called.
//   - In the second loop the word is stem key + suffix, and after the cut
//     SetStemLoc(1) is called.
// The inline comments state that NULL affixes need no cut.
// NOTE(review): the lookup result is only checked with Q_ASSERT, so in
// builds where Q_ASSERT is compiled out a missing word would be
// dereferenced -- confirm this cannot happen.
// NOTE(review): the suffix branch uses the same cut-point formula as the
// prefix branch (word length minus stem length, i.e. the affix length);
// confirm this is what CutRightBeforeHere() expects for suffixed words.
// NOTE(review): what happens when word->Size() > 1 is not visible in this
// listing beyond the LogFile() call that follows it.
2441 void CSignature::CutMyWordsAsIDeclare()
2444 if ( is_initial (GetAffixLocation()) )
2446 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
2447 stem
= GetStem(stemno
);
2449 // For each prefix in signature:
2450 for (int prefixno
= 1; prefixno
<= Size(); ++prefixno
) {
2451 CStringSurrogate prefix
= GetPiece(prefixno
);
2453 prefix
.SetBackwards(false);
2454 if (prefix
.IsNULL())
2455 // NULL + stem prefix needs no cut
2458 // get correspond word
2459 CParse word_text
= prefix
+ stem
->GetKey();
2460 CStem
* word
= *GetLexicon()->GetWords() ^= word_text
;
2461 Q_ASSERT(word
!= 0);
2463 if (word
->Size() > 1 )
2466 GetLexicon()->LogFile ("", "", word
->GetKey().Display());
// Word length minus stem length = length of the prefix.
2469 const int cut_point
= word
->GetKeyLength() - stem
->GetKeyLength();
2470 word
->CutRightBeforeHere(cut_point
);
2471 word
->SetStemLoc(2);
2472 word
->SetPrefixLoc(1);
2473 //m_pLexicon->UpdateWord(word);
2479 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
2480 stem
= GetStem(stemno
);
2482 // For each affix in signature:
2483 for (int suffixno
= 1; suffixno
<= Size(); ++suffixno
) {
2484 CStringSurrogate suffix
= GetPiece(suffixno
);
2486 suffix
.SetBackwards(false);
2487 if (suffix
.IsNULL())
2488 // stem + NULL suffix needs no cut
2491 // get correspond word
2492 CParse word_text
= stem
->GetKey() + suffix
;
2493 CStem
* word
= *GetLexicon()->GetWords() ^= word_text
;
2494 Q_ASSERT(word
!= 0);
2496 if (word
->Size() > 1 )
2499 GetLexicon()->LogFile ("", "", word
->GetKey().Display());
// Same formula as in the prefix branch (see the review note above).
2502 const int cut_point
= word
->GetKeyLength() - stem
->GetKeyLength();
2503 word
->CutRightBeforeHere(cut_point
);
2504 word
->SetStemLoc(1);
2505 //m_pLexicon->UpdateWord(word);
// Writes a textual description of this signature to outf.
//   outf  - stream that receives all output.
//   count - number embedded in the emitted names (STEM<count>, SUF<count>,
//           SIG<count>) and in the leading "# <count>: ..." line.
// Output, in order:
//   - "#" lines with the signature's Display('.', out filter), the size of
//     GetMentorList() (or a "No MentorList() items" line) and m_Robustness;
//   - when GetMentor() is not NULL, a "# Has mentor: skipping" line
//     (presumably followed by an early return; not visible in this listing);
//   - "define STEM<count>" listing this signature's stems plus the stems of
//     every signature in GetMentorList(), each wrapped in "{...}";
//   - "define SUF<count>" built from the suffixes of this signature;
//   - "define SIG<count> STEM<count> SUF<count>;" and "push SIG<count>";
//   - "##" lines holding each collected stem combined with each suffix, the
//     suffix text being omitted when it is "NULL".
// NOTE(review): the first element of `stems` is streamed through *strIt
// with no emptiness check visible in this listing -- confirm that `stems`
// can never be empty here.
2511 void CSignature::OutputSignatureXfst( QTextStream
& outf
, int count
)
2519 outf
<< "# " << count
<< ": " << Display('.', m_pMyMini
->GetOutFilter()) << endl
;
2520 if (this->GetMentorList()->count() > 0)
2521 outf
<< "# MentorList() size: " << this->GetMentorList()->count() << endl
;
2523 outf
<< "# No MentorList() items" << endl
;
2525 outf
<< "# robustness: " << m_Robustness
<< endl
;
2527 if( GetMentor()!=NULL
)
2529 outf
<< "# Has mentor: skipping" << endl
;
2533 outf
<< "define STEM" << count
<< " "; // << " \\" << endl;
// Collect the display form of this signature's own stems.
2537 for (int i
= 0; i
< this->GetNumberOfStems(); i
++)
2539 stems
.append( this->GetStem(i
)->Display() );
2542 // add stems from child sigs
2544 for (int z = 0; z < this->GetMentorList()->size(); z++)
2546 CSignature * qSig = this->GetMentorList()->at(z);
2548 QStringList qSufList;
2549 for (int i = 0; i < qSig->GetNumberOfAffixes(); i++)
2550 qSufList.append(qSig->GetSuffix(i)->Display());
2552 //generate new words here:
2553 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2556 CSuffix* pSuf = this->GetSuffix(i);
2557 QString sufStr = pSuf->Display( 0 );//, m_pMyMini->GetOutFilter() );
2558 if ( !qSufList.contains(sufStr) )
2560 outf<< "#### Suffix to be expanded: "<< sufStr << endl;
2561 for (int j = 0; j < qSig->GetNumberOfStems(); j++)
2563 QString stemStr = qSig->GetStem(j)->Display();
2564 if (sufStr.compare("NULL") == 0)
2565 outf << "### "<< stemStr << endl;
2567 outf << "### "<< stemStr << " " << sufStr << endl;
2573 // add stems from child sigs
2574 for (int z
= 0; z
< this->GetMentorList()->size(); z
++)
2576 CSignature
* qSig
= this->GetMentorList()->at(z
);
2577 for (int i
= 0; i
< qSig
->GetNumberOfStems(); i
++)
2579 stems
.append( qSig
->GetStem(i
)->Display( 0, m_pMyMini
->GetOutFilter() ) );
// Emit the stem alternation: first stem after "[", the rest after "|".
2586 QStringList::Iterator strIt
= stems
.begin();
2587 outf
<< "[ {" << *strIt
<< "} ";
2591 for( ; strIt
!= stems
.end(); ++strIt
)
2598 outf
<< "| {" << *strIt
<< "} ";
2602 outf
<< "]; "<<endl
;
2603 outf
<< "define SUF" << count
<< " [ ";
2604 QStringList suffixes
;
2607 for (int i
= 0; i
< this->GetNumberOfAffixes(); i
++)
2609 CSuffix
* pSuffix
= this->GetSuffix(i
);
2614 QString str
= pSuffix
->Display( 0 );
2615 if (str
.compare("NULL") == 0)
2618 outf
<< " {" << str
<< "} ";
2621 outf
<< "];" << endl
;
2623 outf
<< "define SIG" << count
<< " STEM" << count
<< " SUF"<< count
<< ";" << endl
;
2625 outf
<< "push SIG"<< count
<< endl
;
2627 /* TEMP SOLN: now write cross product in comments */
2628 for ( QStringList::Iterator strIt
= stems
.begin() ; strIt
!= stems
.end(); ++strIt
)
2630 //QList<CSuffix*>::iterator suffix_it = m_SuffixPtrList->begin();
2632 //while ( (pSuffix = *suffix_it) != 0 )
2634 for (int i
= 0; i
< this->GetNumberOfAffixes(); i
++)
2636 CSuffix
* pSuffix
= this->GetSuffix(i
);
2637 QString str
= pSuffix
->Display( 0 );//, m_pMyMini->GetOutFilter() );
// A "NULL" suffix contributes the bare stem.
2638 if (str
.compare("NULL") == 0)
2639 outf
<< "## "<< *strIt
<< endl
;
2641 outf
<< "## "<< *strIt
<< str
<< endl
;
2648 //--------------------------------------------------------------------------//
// For every stem of this signature, rebuilds the text of each word the
// signature predicts, looks it up in GetLexicon()->GetWords() and hands the
// result to AppendWordPointer().
// The word text is stem + suffix in one branch of the switch on
// m_AffixLocation and prefix + stem in the other; an affix whose display
// form is "NULL" is replaced by the empty string first.
// The case labels of the switch are not visible in this listing.
// NOTE(review): the lookup result is passed to AppendWordPointer() with no
// null check visible here -- confirm that a missing word is acceptable.
2649 void CSignature::RecalculateStemAndWordPointers()
2650 //--------------------------------------------------------------------------//
2653 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
2655 QString stem
= GetStem(stemno
)->Display();
2656 switch (m_AffixLocation
)
// Suffix case: word = stem + suffix.
2660 for (int suffixno
= 0; suffixno
< GetNumberOfAffixes(); suffixno
++)
2662 QString suffix
= GetSuffix(suffixno
)->Display();
2663 if (suffix
== "NULL") suffix
= "";
2664 QString word
= stem
+ suffix
;
2665 CStem
* pWord
= *GetLexicon()->GetWords() ^= word
;
2666 AppendWordPointer( pWord
);
// Prefix case: word = prefix + stem.
2671 for (int prefixno
= 0; prefixno
< GetNumberOfAffixes(); prefixno
++)
2673 QString prefix
= GetPrefix(prefixno
)->Display();
2674 if (prefix
== "NULL") prefix
= "";
2675 QString word
= prefix
+ stem
;
2676 CStem
* pWord
= *GetLexicon()->GetWords() ^= word
;
2677 AppendWordPointer(pWord
);
2680 } // end of stemno loop
2682 //--------------------------------------------------------------------------//