1 // Implementation of CTrie methods
2 // Copyright © 2009 The University of Chicago
9 #include <QTextOStream>
10 #include "linguisticamainwindow.h"
11 #include "ui/Status.h"
12 #include "StringSurrogate.h"
15 #include "MorphemeCollection.h"
16 #include "WordCollection.h"
17 #include "CompareFunc.h"
21 /////////////////////////////////////////////////////////////
23 CTrieListViewItem::CTrieListViewItem( Q3ListView
*parent
,
25 : Q3ListViewItem( parent
, label1
)
29 CTrieListViewItem::CTrieListViewItem( Q3ListViewItem
*parent
,
31 : Q3ListViewItem( parent
, label1
)
34 QString
CTrieListViewItem::key( int column
, bool ascending
) const
39 return Q3ListViewItem::key( column
, ascending
);
43 QString
CTrieListViewItem::text( int column
) const
48 return Q3ListViewItem::text( column
);
54 /////////////////////////////////////////////////////////////
56 CTrie::CTrie(bool ReverseFlag
)
57 : m_Root(new CNode()),
59 m_ReverseFlag(ReverseFlag
),
63 m_TrieHasChangedFlag(true),
64 m_IsAlphabetized(false) { }
72 void CTrie::ResetToEmpty()
74 delete m_Root
; // destructor deletes all child nodes
78 m_TrieHasChangedFlag
= true;
79 m_IsAlphabetized
= false;
80 // m_NodeArray, m_ReverseFlag, and m_AutoDelete remain unchanged.
83 CNode
* CTrie::operator<< ( CStringSurrogate ssS
)
89 CNode
* CTrie::operator<< ( CParse
* Parse
)
91 CStringSurrogate ss
= Parse
->GetKey();
95 void CTrie::ListDisplay(Q3ListView
* List
, LinguisticaMainWindow
* doc
,
96 QMap
<QString
, QString
>* filter
, bool ReverseFlag
)
99 // Remove all previous columns
100 while( List
->columns() ) List
->removeColumn( 0 );
103 if( m_ReverseFlag
) List
->addColumn( "Reverse Trie" );
104 else List
->addColumn( "Forward Trie" );
106 if( doc
) // TODO: is doc set and if so are these functions below right
108 doc
->status_display().major_operation
= "Preparing trie for display";
109 doc
->status_display().progress
.set_denominator(m_Count
);
110 m_Root
->ListDisplay( List
, doc
, &count
, filter
, ReverseFlag
);
111 doc
->status_display().major_operation
.clear();
112 doc
->status_display().progress
.clear();
114 else m_Root
->ListDisplay( List
, NULL
, NULL
, filter
, ReverseFlag
);
118 void CTrie::FindSFBumps( CMorphemeCollection
& TempMorphemes
, int SFThreshold
)
120 m_Root
->FindSFBumps ( TempMorphemes
, SFThreshold
);
124 void CTrie::MakeCutsWhereCountExceeds( int count
, int start
, int end
)
126 m_Root
->MakeCutsWhereCountExceeds( count
, start
, end
);
129 //----------------------------------------------------------------
130 ////////////////////////////////////////////////////////////////
131 //----------------------------------------------------------------
135 CNode
* CTrie::Insert( CStringSurrogate ssS
, int* pResult
)
139 m_Root
->Insert(ssS
, m_NumberOfNodes
, &pNode
, pResult
);
141 if ( pResult
&& *pResult
== 1 )
146 m_TrieHasChangedFlag
= TRUE
;
147 m_IsAlphabetized
= FALSE
;
152 CNode
* CTrie::Insert(CStringSurrogate ssS
)
155 int* pResult
= &Result
;
158 m_Root
->Insert(ssS
, m_NumberOfNodes
, &pNode
, pResult
);
160 if ( pResult
&& *pResult
== 1 )
165 m_TrieHasChangedFlag
= TRUE
;
166 m_IsAlphabetized
= FALSE
;
172 //----------------------------------------------------------------
173 ////////////////////////////////////////////////////////////////
174 //----------------------------------------------------------------
178 void CTrie::MakeATerminalPointerArray( CNode
** Array
)
182 m_Root
->MakeATerminalPointerArray( Array
, Index
);
183 Q_ASSERT ( (int) Index
== m_Count
);
189 int CTrie::CountValidSubstrings( CStringSurrogate
& ss
)
191 return m_Root
->CountValidSubstrings( ss
);
196 CNode
* CTrie::Find1 ( CStringSurrogate SS
, bool PartialOK
)
198 CNode
* pNode
= m_Root
->Find1(SS
, PartialOK
);
202 bool CTrie::RemoveFromTrie ( const CStringSurrogate string
, bool RemovePointerFlag
)
204 CNode
* pNode
= m_Root
;
206 m_TrieHasChangedFlag
= TRUE
;
208 for (int i
= pNode
->m_StartPoint
+ 1; i
< pNode
->m_BreakPoint
; i
++ ) {
209 if ( string
[i
] != pNode
->m_Key
[i
] ) {
213 QChar c
= string
[ pNode
->m_BreakPoint
];
214 qNode
= pNode
->FindLetter( c
);
215 if ( qNode
== NULL
) {
218 if (qNode
->GetKey() == string
) {
219 Q_ASSERT ( qNode
->m_Pointer
);
220 qNode
->m_ExistenceFlag
= FALSE
;
222 if ( RemovePointerFlag
&& m_AutoDelete
) {
223 delete qNode
->m_Pointer
;
225 qNode
->m_Pointer
= NULL
;
227 if ( qNode
->m_Letters
== NULL
) {
231 QChar
* NewLetters
= new QChar
[ pNode
->GetNumberOfBranches() - 1 ];
232 CNode
** NewPointers
= new CNode
* [ pNode
->GetNumberOfBranches() - 1 ];
234 for (int j
= 0; j
< (int)pNode
->GetNumberOfBranches(); j
++) {
235 if ( c
== pNode
->m_Letters
[j
] ) { continue; }
236 NewLetters
[k
] = pNode
->m_Letters
[j
];
237 NewPointers
[k
] = pNode
->m_Pointers
[j
];
240 Q_ASSERT ( k
== (int)pNode
->GetNumberOfBranches()-1 );
241 delete [] pNode
->m_Letters
; pNode
->m_Letters
= NewLetters
;
242 delete [] pNode
->m_Pointers
; pNode
->m_Pointers
= NewPointers
;
243 pNode
->SetNumberOfBranches(k
);
246 qNode
->DoesNotExist();
261 CNode
* CTrie::SearchForPrefix(CStringSurrogate
& CStringSurrogate
, int& Result
)
265 pNode
= m_Root
->SearchForPrefix (CStringSurrogate
, Result
);
273 //----------------------------------------------------------------
274 ////////////////////////////////////////////////////////////////
275 //----------------------------------------------------------------
280 CNode
* CTrie::GetRoot1()
285 //----------------------------------------------------------------
286 ////////////////////////////////////////////////////////////////
287 //----------------------------------------------------------------
290 void CTrie::Alphabetize()
292 if ( m_IsAlphabetized
== FALSE
)
294 m_Root
->Alphabetize();
296 m_IsAlphabetized
= TRUE
;
299 int CTrie::ComputeNumberOfEntries()
301 return m_Root
->ComputeNumberOfEntries(0);
304 void CTrie::CreateNodeArray()
306 // XXX. avoid delete/new cycle if there’s room
307 delete[] m_NodeArray
;
308 m_NodeArray
= new CNode
*[m_NumberOfNodes
];
311 m_Root
->CreateNodeArray(m_NodeArray
, Index
);
313 Q_ASSERT(Index
= GetCount());
316 void CTrie::CreateSuffixSignatures(const CStringSurrogate
& prefix
, CParse
* out
)
317 { CreateSuffixSignatures(&prefix
, out
); }
319 //----------------------------------------------------------------------------
320 void CTrie::CreateSuffixSignatures ( const CStringSurrogate
* Prefix
, CParse
* pSignature
)
322 // This finds the location of Prefix in the Trie, and takes all of the
323 // suffixes to it and creates a "signature" of them.
327 CStringSurrogate ssInitial
;
328 CStringSurrogate ssPrefix
= *Prefix
;
333 pNode
= SearchForPrefix ( ssPrefix
, nResult
);
335 Q_ASSERT ( nResult
!= 0 );
337 if ( pNode
&& nResult
== 2 )
339 ssInitial
= pNode
->GetKey().Mid( ssPrefix
.GetLength() );
342 int Offset
= ssPrefix
.GetLength();
343 if( pNode
) pNode
->FindAllTerminalsBelowHere( pSignature
, Offset
);
347 //------------------------------------------------------------------------------------
348 // find the deepest node in the Trie whose count is more than half of m_Count
349 CNode
* CTrie::FindLowestMajorityNode()
351 CNode
* pNode
= m_Root
->FindLowestMajorityNode( m_Count
);
353 if ( pNode
== m_Root
) { return NULL
; }
362 void CTrie::MakeAllNodesVisible(bool Flag
)
364 m_Root
->MakeAllVisible(Flag
);
368 bool CTrie::MakeVisible( const CStringSurrogate string
)
370 return m_Root
->MakeVisible(string
);
375 //----------------------------------------------------------------//
376 ////////////////////////////////////////////////////////////////
377 //----------------------------------------------------------------//
379 // New functions added for FSA
382 void CTrie::MakeMorphemeBoundariesAtThisWidth(int n
, int MinimumStemLength
)
386 m_Root
->MakeMorphemeBoundariesAtThisWidth(n
, MinimumStemLength
, ThisLength
);
393 //----------------------------------------------------------------//
394 ////////////////////////////////////////////////////////////////
395 //----------------------------------------------------------------//
397 void CTrie::MakeCutsAtMorphemeBoundary()
400 m_Root
->MakeCutsAtMorphemeBoundary( depth
);
404 void CTrie::SetAutoDelete( bool b
)
407 m_Root
->SetAutoDelete(b
);
413 /////////////////////////////////////////////////////////////
415 CNode::CNode(QString s
, int StartPoint
, int BreakPoint
)
416 : m_Key(new QChar
[s
.length()]), // filled below
417 m_KeyLength(s
.length()),
421 m_NumberOfBranches(0),
425 m_AutoDelete(true), // *this owns m_Pointer
427 m_StartPoint(StartPoint
),
428 m_BreakPoint(BreakPoint
),
429 m_ExistenceFlag(false),
430 m_MorphemeBoundary(false)
431 { LxStrCpy(s
, m_Key
, s
.length()); }
433 CNode::CNode(const CStringSurrogate
& SS
, int StartPoint
, int BreakPoint
)
434 : m_Key(new QChar
[SS
.GetLength()]), // filled below
435 m_KeyLength(SS
.GetLength()),
439 m_NumberOfBranches(0),
443 m_AutoDelete(true), // *this owns m_Pointer
445 m_StartPoint(StartPoint
),
446 m_BreakPoint(BreakPoint
),
447 m_ExistenceFlag(false),
448 m_MorphemeBoundary(false)
449 { LxStrCpy(&SS
, m_Key
, SS
.GetLength()); }
457 m_NumberOfBranches(0),
461 m_AutoDelete(true), // *this owns m_Pointer
465 m_ExistenceFlag(false),
466 m_MorphemeBoundary(false) { }
470 // Delete child nodes
471 if (m_Letters
!= 0) {
472 for (int i
=0; i
< m_NumberOfBranches
; ++i
)
473 delete m_Pointers
[i
];
477 // If we own it, delete data
485 void CNode::SetAutoDelete( bool b
)
488 for( int i
=0; m_Letters
&& i
< m_NumberOfBranches
; i
++ )
490 m_Pointers
[i
]->SetAutoDelete(b
);
495 int CNode::CountValidSubstrings( CStringSurrogate
& ss
)
499 if( m_ExistenceFlag
) count
++;
501 CNode
* pNode
= FindLetter( ss
[0] );
504 int step
= pNode
->m_BreakPoint
- m_BreakPoint
;
505 CStringSurrogate ssMid
= ss
.Mid( step
);
506 count
+= pNode
->CountValidSubstrings( ssMid
);
513 void CNode::FindSFBumps ( CMorphemeCollection
& TempMorphemes
, int SFThreshold
)
515 if ( GetWidth() > SFThreshold
)
517 TempMorphemes
<< CStringSurrogate( m_Key
, 0, m_KeyLength
);
520 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
522 m_Pointers
[i
]->FindSFBumps ( TempMorphemes
, SFThreshold
);
527 void CNode::BreakAllWordsBelowHere(int BreakLocation
, CWordCollection
* Words
)
531 CStringSurrogate ssWord
;
533 if ( m_KeyLength
> 2 && m_KeyLength
> BreakLocation
+ 1)
535 ssWord
= CSS(m_Key
, 0, m_KeyLength
);
537 ssWord
= ssWord
.Mid(1, ssWord
.GetLength() - 2 );// eliminate first and last #
538 pWord
= *Words
^= ssWord
;
540 pWord
->CutRightBeforeHere (BreakLocation
);
545 CNode
* CNode::Insert(CStringSurrogate
& SS
, int& NumberOfNodes
,
546 CNode
** ppNode
, int* pResult
)
552 // The node that is returned is the node immediately down from
553 // node that called this function in the first place.
554 // It may be "This" or it may not.
555 // ppNode is a pointer to the Terminal node that is eventually
556 // identified or created for the string in question.
557 CStringSurrogate ssKey
= GetKey();
558 auto_ptr
<CNode
> this_node(this);
560 // Result: 1: new node added
561 // 2: node already existed
563 //-----------------------------------------------------------------------//
565 // 1: A difference before m_BreakPoint: it's definitely a new string.
566 // should i be initialized as m_StartPoint + 1??
567 int i
= m_StartPoint
;
568 for (; i
< m_BreakPoint
&& i
< SS
.GetLength(); ++i
) {
569 // XXX. SS and ssKey could be backwards, hence the obscure test.
570 // if ( SS[i] != ssKey[i] )
571 if ( LxStrCmp( &SS
, ssKey
.GetKey(),1,1,SS
.GetStart()+i
,ssKey
.GetStart()+i
) ) {
572 // we create a new node that dominates *this and also
573 // dominates the other new node created, new_node.
575 auto_ptr
<CNode
> prev(new CNode(ssKey
.Left(i
), m_StartPoint
, i
));
576 prev
->GetLink(this_node
.release());
580 auto_ptr
<CNode
> new_node(new CNode(SS
, i
, SS
.GetLength()));
582 CNode
* inserted
= new_node
.get();
583 prev
->GetLink(new_node
.release());
589 if (pResult
) *pResult
= 1;
591 prev
->SetCountBelow((m_ExistenceFlag
? 0 : 1) + m_CountBelow
+ 1);
592 prev
->SetCorpusCount(m_CorpusCount
+ 1);
593 inserted
->SetCorpusCount(1);
595 prev
->SetAutoDelete(m_AutoDelete
);
596 inserted
->SetAutoDelete(m_AutoDelete
);
598 return prev
.release();
601 // Now i == m_BreakPoint or i == SS.GetLength(),
602 // and m_Key[j] == SS.m_Key[j] for all j < i
604 //-----------------------------------------------------------------------//
606 // 2. "s" is shorter than this node's Key:
607 if (i
< m_BreakPoint
&& i
== SS
.GetLength()) {
608 auto_ptr
<CNode
> prev(new CNode(ssKey
.Left(i
), m_StartPoint
, i
));
609 prev
->GetLink(this_node
.release());
613 *ppNode
= prev
.get();
616 if (pResult
) *pResult
= 1;
618 prev
->SetCountBelow((m_ExistenceFlag
? 0 : 1) + m_CountBelow
);
619 prev
->SetCorpusCount(m_CorpusCount
+ 1);
620 prev
->SetAutoDelete(m_AutoDelete
);
622 return prev
.release();
625 //-----------------------------------------------------------------------//
627 // 3. "s" is exactly this node's key:
628 if (i
== m_BreakPoint
&& i
== SS
.GetLength()) {
637 SetCorpusCount(m_CorpusCount
+ 1);
641 return this_node
.release();
644 //-----------------------------------------------------------------------//
645 // 4. This node only takes us part way into the word:
646 QChar Letter
= SS
[m_BreakPoint
];
647 for (int j
= 0; j
< m_NumberOfBranches
; ++j
) {
648 if (m_Letters
[j
] == Letter
) {
649 CNode
* next
= m_Pointers
[j
];
651 auto_ptr
<CNode
> new_node(next
->Insert(SS
, NumberOfNodes
,
653 if (pResult
) *pResult
= result
;
654 m_Pointers
[j
] = new_node
.release();
661 return this_node
.release();
665 //-----------------------------------------------------------------------//
666 // 5. the letter at BreakPoint is new:
667 auto_ptr
<CNode
> new_node(new CNode(SS
, m_BreakPoint
, SS
.GetLength()));
669 new_node
->SetCorpusCount(1);
670 new_node
->SetAutoDelete(m_AutoDelete
);
672 *ppNode
= new_node
.get();
674 int length
= m_NumberOfBranches
;
676 // XXX. use realloc work-alike
677 QChar
* NewLetters
= new QChar
[length
+ 1];
678 CNode
** NewPointers
= new CNode
*[length
+ 1];
679 copy(m_Letters
, m_Letters
+ length
, NewLetters
);
680 copy(m_Pointers
, m_Pointers
+ length
, NewPointers
);
681 NewLetters
[length
] = Letter
;
682 NewPointers
[length
] = new_node
.release();
684 swap(m_Letters
, NewLetters
);
685 swap(m_Pointers
, NewPointers
);
686 ++m_NumberOfBranches
;
689 delete[] NewPointers
;
692 if (pResult
) *pResult
= 1;
697 return this_node
.release();
699 //-----------------------------------------------------------------------//
700 // end ///////////////////////
704 CNode
** CNode::GetLink ( const CStringSurrogate s
)
708 QChar c
= s
[m_BreakPoint
];
713 length
= m_NumberOfBranches
;
715 for ( i
= 0; i
< length
; i
++)
717 if (m_Letters
[i
] == c
) return &m_Pointers
[i
];
720 QChar
* NewLetters
= new QChar
[length
+ 1];
721 CNode
** NewPointers
= new CNode
*[length
+ 1];
722 for ( i
= 0; i
< length
; i
++)
724 NewLetters
[i
] = m_Letters
[i
];
725 NewPointers
[i
] = m_Pointers
[i
];
727 NewLetters
[length
] = c
;
728 m_NumberOfBranches
++;
730 m_Letters
= NewLetters
;
732 m_Pointers
= NewPointers
;
733 m_Pointers
[length
] = new CNode(s
, m_BreakPoint
, m_BreakPoint
);
734 return &m_Pointers
[length
];
738 CNode
* CNode::FindLetter (QChar c
)
742 if (m_Letters
) { length
= m_NumberOfBranches
;}
743 for (int i
= 0; i
< length
; i
++)
745 if (m_Letters
[i
] == c
) return m_Pointers
[i
];
752 //int CNode::GetCount() const
758 CNode
** CNode::GetLink(CNode
* pNode
)
763 int length
= m_NumberOfBranches
;
765 // case 1: pNode is terminal, has same Key as this:
766 if (pNode
->GetKeyLength() == m_BreakPoint
)
769 QChar c
= pNode
->GetKey()[m_BreakPoint
];
771 for (int i
= 0; i
< length
; ++i
)
772 if (m_Letters
[i
] == c
)
773 return &m_Pointers
[i
];
775 // XXX. should use realloc workalike
776 QChar
* NewLetters
= new QChar
[length
+ 1];
777 CNode
** NewPointers
= new CNode
*[length
+ 1];
778 copy(m_Letters
, m_Letters
+ length
, NewLetters
);
779 copy(m_Pointers
, m_Pointers
+ length
, NewPointers
);
780 NewLetters
[length
] = c
;
781 NewPointers
[length
] = pNode
;
783 swap(m_Letters
, NewLetters
);
784 swap(m_Pointers
, NewPointers
);
786 delete[] NewPointers
;
788 ++m_NumberOfBranches
;
789 return &m_Pointers
[length
];
792 void CNode::MakeCutsWhereCountExceeds( int count
, int start
, int end
)
794 CStringSurrogate
ssKey(m_Key
,0,m_KeyLength
);
795 // QString spaces = "\n";
798 if( start
< m_KeyLength
&& end
+1 >= m_KeyLength
&& m_NumberOfBranches
>= count
)
800 // for( i=0; i < m_KeyLength; i++ ) spaces += " ";
801 CutAllWordsHereAndBelow_AfterNLetters ( m_KeyLength
);
804 if( end
+1 >= m_KeyLength
)
806 for( i
=0; m_Letters
&& i
< m_NumberOfBranches
; i
++ )
808 m_Pointers
[i
]->MakeCutsWhereCountExceeds( count
, start
, end
);
814 CStringSurrogate
CNode::GetKey()
816 return CStringSurrogate(m_Key
,0,m_KeyLength
);
820 void CNode::MakeATerminalPointerArray (CNode
** Array
, int& Index
)
825 Array
[ Index
] = this;
829 for (int i
= 0 ; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
831 m_Pointers
[i
]->MakeATerminalPointerArray(Array
, Index
);
837 CNode
* CNode::Find1(const CStringSurrogate string
, bool PartialOK
)
839 if (m_Key
&& LxStrCmp( &string
, m_Key
, string
.GetLength(), m_KeyLength
) == 0 && ( PartialOK
|| m_ExistenceFlag
) )
846 if( string
.GetLength() < m_BreakPoint
)
848 if( !PartialOK
) return NULL
;
851 for( int i
= m_StartPoint
; i
< m_BreakPoint
-1; i
++ )
853 if( string
[i
] != m_Key
[i
] )
855 if( PartialOK
&& ( i
> m_StartPoint
) ) return this;
861 QChar c
= string
[m_BreakPoint
];
862 for ( int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
864 if ( c
== m_Letters
[i
] )
866 return m_Pointers
[i
] ->Find1 (string
);
873 CNode
* CNode::SearchForPrefix ( CStringSurrogate
& SS
, int& Result
, int Letterno
)
876 Q_ASSERT ( Letterno
== m_StartPoint
);
877 if ( Letterno
< m_BreakPoint
-1)
879 for (int i
= Letterno
+ 1; i
< m_BreakPoint
; i
++)
881 if ( i
== SS
.GetLength() )
885 // Prefix is inside of this node, however!
887 if ( SS
[i
] != m_Key
[i
] )
897 Letterno
= m_BreakPoint
;
899 if ( m_Key
&& SS
== CStringSurrogate( m_Key
, 0, m_KeyLength
) )
905 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
907 if ( SS
[Letterno
] == m_Letters
[i
] )
909 pNode
= FindLetter ( SS
[m_BreakPoint
] );
910 pNode
= pNode
->SearchForPrefix ( SS
, Result
, m_BreakPoint
);
919 void CNode::ListDisplay(Q3ListView
* List
, LinguisticaMainWindow
* doc
,
920 int* count
, QMap
<QString
, QString
>* filter
, bool ReverseFlag
)
923 CStringSurrogate
ssKey(m_Key
,0,m_KeyLength
);
928 below
= " Count below: ";
929 below
+= QString("%1").arg(m_CountBelow
);
934 // Update on multiples of 100
935 if( (*count
) % 100 == 0 )
936 doc
->status_display().progress
= *count
;
942 Output
.append ( ssKey
.Display(filter
) );
943 if ( ReverseFlag
== TRUE
)
945 Output
.append( " (" );
947 LxStrCpy_R( ssKey
.Display(filter
),reverse
,ssKey
.GetLength(), ssKey
.GetStart() );
948 Output
.append( reverse
+ ")" );
953 if (m_ExistenceFlag
)
955 Output
.append (star
);
957 Output
.append ( width
);
958 Output
+= QString("%1").arg( GetWidth() );
961 Output
.append (lt_curly
);
962 Output
.append ( CStringSurrogate( m_Letters
, 0, m_NumberOfBranches
).Display(filter
) );
963 Output
.append (rt_curly
);
966 Output
.append ( below
);
968 CTrieListViewItem
* item
= new CTrieListViewItem( List
, Output
);
969 item
->setSelectable(false);
972 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
974 m_Pointers
[i
] ->ListDisplay (item
, doc
, count
, filter
, ReverseFlag
);
979 void CNode::ListDisplay(Q3ListViewItem
* List
, LinguisticaMainWindow
* doc
,
980 int* count
, QMap
<QString
, QString
>* filter
, bool ReverseFlag
)
983 CStringSurrogate
ssKey(m_Key
,0,m_KeyLength
);
988 below
= " Count below: ";
989 below
+= QString("%1").arg(m_CountBelow
);
994 // Update on multiples of 100
995 if( (*count
) % 100 == 0 )
996 doc
->status_display().progress
= *count
;
1002 Output
.append ( ssKey
.Display(filter
) );
1003 if ( ReverseFlag
== TRUE
)
1005 Output
.append( " (" );
1007 LxStrCpy_R( ssKey
.Display(filter
),reverse
,ssKey
.GetLength(), ssKey
.GetStart() );
1008 Output
.append( reverse
+ ")" );
1013 if (m_ExistenceFlag
)
1015 Output
.append (star
);
1017 Output
.append ( width
);
1018 Output
+= QString("%1").arg( GetWidth() );
1021 Output
.append (lt_curly
);
1022 Output
.append ( CStringSurrogate( m_Letters
, 0, m_NumberOfBranches
).Display(filter
) );
1023 Output
.append (rt_curly
);
1026 Output
.append ( below
);
1028 CTrieListViewItem
* item
= new CTrieListViewItem( List
, Output
);
1029 item
->setSelectable(false);
1030 item
->setOpen(true);
1032 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1034 m_Pointers
[i
] ->ListDisplay (item
, doc
, count
, filter
, ReverseFlag
);
1039 int CNode::GetWidth()
1042 if (m_ExistenceFlag
) { width
= 1;}
1044 if (m_Pointers
) { return m_NumberOfBranches
+ width
; } else {return width
; }
1049 // helper for CNode::Alphabetize
1050 // index_in<char>("Hello")(0) == 'H'.
1051 template<class T
> class index_in
: public std::unary_function
<int, T
> {
1054 index_in(const T
* v
) : array(v
) { }
1055 T
operator()(int i
) { return array
[i
]; }
1059 void CNode::Alphabetize()
1063 using std::transform
;
1066 const int Length
= m_NumberOfBranches
;
1072 QChar
* NewLetters
= new QChar
[Length
];
1073 CNode
** NewPointers
= new CNode
*[Length
];
1075 std::vector
<int> shuffle(Length
);
1076 SortLetters(&shuffle
[0], m_Letters
, Length
);
1078 transform(shuffle
.begin(), shuffle
.end(),
1079 NewLetters
, index_in
<QChar
>(m_Letters
));
1080 transform(shuffle
.begin(), shuffle
.end(),
1081 NewPointers
, index_in
<CNode
*>(m_Pointers
));
1082 swap(m_Letters
, NewLetters
);
1083 swap(m_Pointers
, NewPointers
);
1085 delete[] NewLetters
;
1086 delete[] NewPointers
;
1088 std::for_each(m_Pointers
, m_Pointers
+ Length
,
1089 mem_fun(&CNode::Alphabetize
));
1093 void* CNode::Get_T_Pointer()
1095 if (m_ExistenceFlag
)
1106 int CNode::ComputeNumberOfEntries(int count
)
1108 if (m_ExistenceFlag
)
1113 for (int i
= 0 ; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1115 count
= m_Pointers
[i
]->ComputeNumberOfEntries(count
);
1121 void CNode::CreateNodeArray(CNode
** NodeArray
, int& Index
)
1124 NodeArray
[Index
] = this;
1127 for (int i
= 0 ; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1129 m_Pointers
[i
]->CreateNodeArray(NodeArray
, Index
);
1135 void CNode::FindAllTerminalsBelowHere( CParse
* pSuffixes
, int Offset
)
1137 CStringSurrogate ssPiece
;
1138 if( m_ExistenceFlag
)
1140 ssPiece
= CStringSurrogate( m_Key
,0,m_KeyLength
);
1141 pSuffixes
->Append( ssPiece
.Mid( Offset
) );
1144 for( int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++ )
1146 m_Pointers
[i
]->FindAllTerminalsBelowHere ( pSuffixes
, Offset
);
1152 CNode
* CNode::FindLowestMajorityNode(int Count
)
1154 bool FoundFlag
= FALSE
;
1158 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1160 if ( m_Pointers
[i
]->GetCountBelow() > Count
/ 2 )
1163 return pNode
= m_Pointers
[i
]->FindLowestMajorityNode( Count
);
1168 if ( FoundFlag
== FALSE
)
1170 if (m_CountBelow
> Count
/ 2 ) { return this; }
1181 void CNode::DumpVisibleToLogFile(QTextOStream
* stream
, bool ReverseFlag
)
1183 if (m_Visible
&& m_ExistenceFlag
)
1187 QChar
* string
= new QChar
[ m_KeyLength
+ 1];
1188 LxStrCpy_R(m_Key
, string
, m_KeyLength
);
1189 *stream
<< string
<< endl
;
1193 *stream
<< m_Key
<< endl
;
1195 for (int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1197 m_Pointers
[i
]->DumpVisibleToLogFile(stream
, ReverseFlag
);
1204 void CNode::DumpVisibleWords ( CWordCollection* Words, bool ReverseFlag )
1206 if (m_Visible && m_ExistenceFlag )
1210 *Words << CStringSurrogate(m_Key, 0, m_KeyLength);
1214 *Words << CStringSurrogate(m_Key, 0, m_KeyLength);
1217 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1219 m_Pointers[i]->DumpVisibleWords(Words, ReverseFlag);
1225 void CNode::MakeAllVisible(bool Flag
)
1228 for (int i
= 0 ; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1230 m_Pointers
[i
]->MakeAllVisible(Flag
);
1235 bool CNode::MakeVisible( const CStringSurrogate string
)
1237 if (m_Key
&& LxStrCmp( &string
, m_Key
, string
.GetLength(), m_KeyLength
) == 0 && m_ExistenceFlag
)
1245 if ( string
.GetLength() < m_BreakPoint
)
1249 for (int i
= m_StartPoint
; i
< m_BreakPoint
-1;i
++)
1251 if ( string
[i
] != m_Key
[i
] )
1259 QChar c
= string
[m_BreakPoint
];
1260 for ( int i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1262 if ( c
== m_Letters
[i
] )
1264 return m_Pointers
[i
] ->MakeVisible (string
);
1271 bool CNode::MakeMorphemeBoundariesAtThisWidth(int n
, int MinimumStemLength
, int ThisLength
)
1274 bool bLowerIsBoundary
= FALSE
;
1276 int NumberOfBranches
=0;
1278 // QString DebugString;
1279 // const char* DebugStringDisplay;
1285 KeyLen
= GetKeyLength();
1286 ThisLength
= KeyLen
;
1288 // DebugString = QString(m_Key, KeyLen);
1289 // DebugStringDisplay = DebugString.ascii();
1293 NumberOfBranches
= GetNumberOfBranches();
1295 for (int i
= 0 ; m_Letters
&& i
< NumberOfBranches
; i
++)
1297 val
= m_Pointers
[i
]->MakeMorphemeBoundariesAtThisWidth(n
, MinimumStemLength
, ThisLength
);
1298 if (val
) bLowerIsBoundary
= TRUE
;
1301 if ( ThisLength
>= MinimumStemLength
&& GetWidth() >= n
)
1303 m_MorphemeBoundary
= TRUE
;
1307 m_MorphemeBoundary
= false;
1310 return m_MorphemeBoundary
;
1314 void CNode::CutAllWordsHereAndBelow_AfterNLetters ( int n
)
1316 int NumberOfBranches
=0;
1318 if ( m_ExistenceFlag
)
1320 ((CStem
*)m_Pointer
)->CutRightBeforeHere (n
);
1323 NumberOfBranches
= GetNumberOfBranches();
1325 for (int i
= 0 ; m_Letters
&& i
< NumberOfBranches
; i
++)
1327 m_Pointers
[i
]->CutAllWordsHereAndBelow_AfterNLetters ( n
);
1332 void CNode::MakeCutsAtMorphemeBoundary( int depth
)
1334 int ThisDepth
= depth
;
1336 int NumberOfBranches
=0;
1340 KeyLen
= GetKeyLength();
1344 if ( m_MorphemeBoundary
)
1346 CutAllWordsHereAndBelow_AfterNLetters ( ThisDepth
);
1349 NumberOfBranches
= GetNumberOfBranches();
1351 for (int i
= 0 ; m_Letters
&& i
< NumberOfBranches
; i
++)
1353 m_Pointers
[i
]->MakeCutsAtMorphemeBoundary(ThisDepth
);
1357 int CTrie::ProjectTrieStructureOntoWord( CParse
* parse
)
1361 parse
->ClearParseStructure();
1372 status
= m_Root
->ProjectTrieStructureOntoWord( parse
);
1385 int CNode::ProjectTrieStructureOntoWord( CParse
* parse
, int position
)
1387 CStringSurrogate ssKey
= GetKey();
1388 CStringSurrogate SS
= parse
->GetKey();
1390 int newPosition
= position
;
1394 // Result: -1: word doesn't exist
1397 //-----------------------------------------------------------------------//
1399 // 1: A difference before m_BreakPoint: it's definitely a new string.
1400 int i
= m_StartPoint
;
1401 for (; (SS
.GetLength() > 0) && m_Key
&& i
< m_BreakPoint
&& i
< SS
.GetLength(); i
++)
1405 // if ( SS[i] != ssKey[i] )
1406 if ( LxStrCmp( &SS
, ssKey
.GetKey(),1,1,SS
.GetStart()+i
,ssKey
.GetStart()+i
) )
1412 //-----------------------------------------------------------------------//
1414 // 2. "s" is shorter than this node's Key:
1415 if ( i
< m_BreakPoint
&& i
== SS
.GetLength() )
1420 //-----------------------------------------------------------------------//
1422 // 3. "s" is exactly this node's key:
1423 if ( m_Key
&& LxStrCmp ( &SS
, m_Key
, SS
.GetLength(), m_KeyLength
, SS
.GetStart() ) == 0 )
1425 parse
->SetPieceValue( parse
->Size(), (double) m_CorpusCount
);
1431 //-----------------------------------------------------------------------//
1432 // 4. This node only takes us part way into the word:
1433 QChar Letter
= SS
[m_BreakPoint
];
1434 for (int j
= 0; m_Letters
&& j
< (int)m_NumberOfBranches
; j
++)
1436 if ( m_Letters
[j
] == Letter
)
1440 parse
->SetPieceValue( parse
->Size(), (double) m_CorpusCount
);
1441 parse
->CutNFromTheLeft( newPosition
);
1444 return m_Pointers
[j
]->ProjectTrieStructureOntoWord( parse
, newPosition
);
1449 //-----------------------------------------------------------------------//
1450 // 5. the letter at BreakPoint is new:
1453 //-----------------------------------------------------------------------//
1454 // end ///////////////////////
1457 QString
CTrie::Display()
1459 return QString( m_Root
->Display( 0, m_ReverseFlag
) + "=========================" );
1462 QString
CNode::Display( int tabs
, bool ReverseFlag
)
1464 QString Output
= " ";
1469 below
= " Count below: ";
1470 below
+= QString("%1").arg(m_CountBelow
);
1472 CStringSurrogate
ssKey(m_Key
,0,m_KeyLength
);
1475 for( i
= 0; i
< tabs
; i
++ )
1477 Output
.append( "__" );
1479 Output
.append( " " );
1483 Output
.append ( ssKey
.Display() );
1484 if ( ReverseFlag
== TRUE
)
1486 Output
.append( " (" );
1488 LxStrCpy_R( ssKey
.Display(),reverse
,ssKey
.GetLength(), ssKey
.GetStart() );
1489 Output
.append( reverse
+ ")" );
1494 if (m_ExistenceFlag
)
1496 Output
.append (star
);
1498 // Output.append ( width );
1499 // Output += QString("%1").arg( GetWidth() );
1502 Output
.append (lt_curly
);
1503 Output
.append ( CStringSurrogate( m_Letters
, 0, m_NumberOfBranches
).Display() );
1504 Output
.append (rt_curly
);
1507 // Output.append ( below );
1509 Output
.append ( "\n" );
1511 for (i
= 0; m_Letters
&& i
< (int)m_NumberOfBranches
; i
++)
1513 Output
.append( m_Pointers
[i
]->Display( tabs
+1, ReverseFlag
) );