1 // Implementation of finite-state automaton class
2 // Copyright © 2009 The University of Chicago
8 #include <QTableWidgetItem>
10 #include "MiniLexicon.h"
11 #include "Signature.h"
13 #include "SignatureCollection.h"
14 #include "StemCollection.h"
15 #include "WordCollection.h"
16 #include "SuffixCollection.h"
21 #include "PrefixCollection.h"
25 #include <graphviz/gvc.h>
28 #define TEST_FUNC_MAX 32
32 double FSAMorpheme::GetDL(int characterCount
) const
34 if (this->str
== "NULL")
36 return double(this->str
.size()) * base2log(characterCount
);
39 bool FSAMorpheme::operator==(const FSAMorpheme
& other
) const
41 return this->toStr() == other
.toStr();
44 namespace linguistica
{
45 namespace join_helper
{
46 class cat_with_delim
: public std::binary_function
<
47 QString
, const FSAMorpheme
*, QString
> {
50 cat_with_delim(const QString
& d
) : delim(d
) { }
51 QString
operator()(const QString
& lhs
,
52 const FSAMorpheme
* rhs
) const
56 result
.append(rhs
->toStr());
63 QString
join(const FSAMorphemeList
& v
, const QString
& delim
)
65 using linguistica::join_helper::cat_with_delim
;
66 using std::accumulate
;
68 FSAMorphemeList::const_iterator iter
= v
.begin();
73 const QString first
= (*iter
)->toStr();
74 return accumulate(++iter
, v
.end(), first
, cat_with_delim(delim
));
77 FSAstate::FSAstate(FSA
* pFSA
)
78 : m_EdgesOut(), m_EdgesIn(),
79 m_MaxDepth(-1), //m_stateLabel(),
80 m_PathList(), m_DiscoverCount(0),
81 m_stateId(pFSA
->m_nextStateId
++)
82 { pFSA
->AddState(this); }
84 /*int FSAstate::getWordCount() const
88 for(int j=0; j<this->GetEdgesIn()->size();j++) {
89 FSAedge* pEdge = this->GetEdgesIn()->at(j);
90 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
92 sumIn += pEdge->GetMorphemes()->at(k)->m_corpusCount;
96 for(int j=0; j<this->GetEdgesOut()->size();j++) {
97 FSAedge* pEdge = this->GetEdgesOut()->at(j);
98 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
99 sumOut += pEdge->GetMorphemes()->at(k)->m_corpusCount;
102 Q_ASSERT(sumIn==0 || sumOut==0 || sumOut==sumIn);
103 Q_ASSERT(sumOut+sumIn > 0);
112 void FSAstate::AddEdgeOut(FSAedge
* pEdge
)
114 this->m_EdgesOut
.push_back(pEdge
);
117 void FSAstate::AddEdgeIn(FSAedge
* pEdge
)
119 this->m_EdgesIn
.push_back(pEdge
);
121 //----------------------------------------------------------------//
122 FSAedge::FSAedge(FSA
* pFSA
, FSAstate
* pStartState
, FSAstate
* pEndState
)
123 : m_Morphemes(), m_FromState(pStartState
), m_ToState(pEndState
), m_FSA(pFSA
)
128 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState, CParse* pLabels)
129 : m_Morphemes(), // initialized below
130 m_FromState(pStartState),
134 pLabels->CreateQStringList(m_Morphemes);
137 //void FSAedge::AddMorpheme(Morpheme* pMorph)
138 void FSAedge::AddMorpheme(const QString
& str
, int addedCount
)
140 Q_ASSERT(addedCount
);
141 //if Morpheme not included in FSA, update
142 if(this->m_FSA
->m_morphemes
.contains(str
))
144 FSAMorpheme
* pMorph
= this->m_FSA
->m_morphemes
[str
];
145 pMorph
->m_corpusCount
+=addedCount
;
147 Q_ASSERT(1); //should not be called now, we are not adding stems or suffixes
148 FSAMorpheme
* pMorph
= new FSAMorpheme(str
,addedCount
);
149 this->m_FSA
->m_morphemes
.insert(str
, pMorph
);
152 //Q_ASSERT(!this->m_FSA->m_morphemeEdges.values(str).contains(this));
153 //this->m_FSA->m_morphemeEdges.insert(str,this);
154 this->m_Morphemes
.push_back( new FSAMorpheme(str
,addedCount
) );
157 void FSAedge::RemoveMorpheme(FSAMorphemeList::iterator iter
)
159 FSAMorpheme
* pMorph
= *iter
;
160 QString str
= pMorph
->toStr();
161 int count
= pMorph
->m_corpusCount
;
162 //this->GetMorphemes()->remove(pMorph);
163 this->GetMorphemes()->erase(iter
);
165 // Do cleanup at FSA level
166 FSAMorpheme
* FsaMorph
= this->m_FSA
->m_morphemes
[str
];
168 Q_ASSERT(FsaMorph
->m_corpusCount
>= count
);
169 //std::cout << "Removing: " << pMorph->toStr().toStdString() << " FSA count: " << FsaMorph->m_corpusCount << " This edge count:" << count << std::endl;
170 if(count
== FsaMorph
->m_corpusCount
) //remove from FSA
171 this->m_FSA
->m_morphemes
.remove(str
);
173 this->m_FSA
->m_morphemes
[str
]->m_corpusCount
-=count
;
178 FSAedge
* FSA::DoParallelSplit(FSAedge
* pEdge
, FSAMorphemeList
& morphsToMove
)
180 Q_ASSERT(!morphsToMove
.empty());
181 //if(morphsToMove.empty()) return NULL;
183 if(morphsToMove
.size() == pEdge
->GetMorphemes()->size())
186 FSAMorphemeList copy1
= *pEdge
->GetMorphemes();
187 FSAMorphemeList copy2
= morphsToMove
;
190 Q_ASSERT(copy1
==copy2
);
195 FSAMorphemeList
* pEdgeMorphemes
= pEdge
->GetMorphemes();
196 foreach (FSAMorpheme
* pMorph
, morphsToMove
) {
197 FSAMorphemeList::iterator m_it
=
198 std::find(pEdgeMorphemes
->begin(),pEdgeMorphemes
->end(),pMorph
);
199 Q_ASSERT(m_it
!=pEdgeMorphemes
->end());
202 //just move dynamically allocated FSAMorph object from one edge to another.
203 // No other bookkeeping needed, but watch for this if defn any of FSA classes
204 // change. Note that morpheme counts are tracked at FSA level, but split does
205 // not change these counts.
206 FSAedge
* pNewEdge
= new FSAedge(this, pEdge
->GetFromState(), pEdge
->GetToState());
207 foreach (FSAMorpheme
*pMorph
,morphsToMove
) {
208 pEdge
->GetMorphemes()->remove(pMorph
);
209 pNewEdge
->GetMorphemes()->push_back(pMorph
);
214 void FSA::DoMultParallelSplit( FSAedge
* pEdge
,
215 std::list
<FSAMorphemeList
>::iterator first
,
216 std::list
<FSAMorphemeList
>::iterator last
)
218 std::list
<FSAMorphemeList
>::iterator list_it
= first
;
223 foreach(FSAMorpheme
* pMorph
,*list_it
++)
224 all
.push_back(pMorph
);
226 Q_ASSERT(all
.size()==pEdge
->GetMorphemes()->size());
228 FSAMorphemeList copy
= *pEdge
->GetMorphemes();
231 list_it
=first
; //reset iterator
233 //remove 1 non-empty list, some morphs will stay on pEdge
234 while((*list_it
++).size()==0)
238 //this->DoParallelSplit(pEdge,*list_it++);
239 FSAMorphemeList
& morphList
= *list_it
++;
241 if(morphList
.size()==0) continue; //ignore empty lists
243 FSAedge
* pNewEdge
= new FSAedge(this, pEdge
->GetFromState(), pEdge
->GetToState());
244 foreach (FSAMorpheme
*pMorph
,morphList
) {
245 pEdge
->GetMorphemes()->remove(pMorph
);
246 pNewEdge
->GetMorphemes()->push_back(pMorph
);
253 FSAedge
* FSA::DoSeriesSplit(FSAedge
* pEdge
,unsigned int len
, bool suffix
)
255 Q_ASSERT(!pEdge
->GetMorphemes()->empty());
257 QString firstMorph
= pEdge
->GetMorphemes()->front()->toStr();
258 QString commonMorphStr
= ( suffix
? firstMorph
.right(len
) : firstMorph
.left(len
) );
260 FSAstate
* pNewState
= new FSAstate(this);
261 FSAedge
* pEdge1
= new FSAedge(this, pEdge
->GetFromState(),pNewState
);
262 FSAedge
* pEdge2
= new FSAedge(this, pNewState
,pEdge
->GetToState());
264 FSAedge
*commonEdge
, *multiEdge
;
272 unsigned int totalCount
= 0;
274 FSAMorphemeList::iterator m_iter
= pEdge
->GetMorphemes()->begin();
275 while(m_iter
!= pEdge
->GetMorphemes()->end())
277 FSAMorpheme
* pMorph
= *m_iter
;
278 QString morphStr
= pMorph
->toStr();
279 QString curCommon
= ( suffix
? morphStr
.right(len
) : morphStr
.left(len
) );
280 Q_ASSERT( curCommon
== commonMorphStr
);
281 totalCount
+= pMorph
->m_corpusCount
;
283 unsigned int len_other
= morphStr
.length()-len
;
284 QString cur_multi
= ( suffix
? morphStr
.left(len_other
) : morphStr
.right(len_other
) );
286 multiEdge
->AddMorpheme(cur_multi
,pMorph
->m_corpusCount
);
287 pEdge
->RemoveMorpheme(m_iter
++);
290 Q_ASSERT(pEdge
->GetMorphemes()->empty());
291 commonEdge
->AddMorpheme(commonMorphStr
,totalCount
);
293 this->RemoveEdge(pEdge
);
299 : m_States(), m_Edges(), m_StartStates(),
300 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
301 m_lexicon(), m_charCount(0),
303 m_nextStartStateId(1), m_nextStateId(1) { }
305 FSA::FSA(const FSA
& fsa
)
306 : m_States(fsa
.m_States
), m_Edges(fsa
.m_Edges
), m_StartStates(fsa
.m_StartStates
),
307 m_FSAPathList(fsa
.m_FSAPathList
), m_searched(false), m_MaxPathLength(0),
308 m_lexicon(), m_charCount(0),
309 m_morphemes(fsa
.m_morphemes
),
310 m_nextStartStateId(1), m_nextStateId(1) { }
312 FSA::FSA(CMiniLexicon
* pMiniLexicon
)
313 : m_States(new FSAStateList
),
314 m_Edges(new FSAedgeList
),
315 m_StartStates(new FSAStateList
),
316 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
317 m_lexicon(pMiniLexicon
),
318 m_charCount(pMiniLexicon
->GetNumberOfCharacterTypes()),
320 m_nextStartStateId(1), m_nextStateId(1)
322 //m_StartState->setMaxDepth(0);
323 // this->AddState(m_StartState);
325 //int MinimumNumberOfStemsForDisplay = 2;
326 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
327 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
329 for (int i
= 0; i
< pMiniLexicon
->GetSignatures()->GetCount(); i
++)
331 CSignature
* pSig
= pMiniLexicon
->GetSignatures()->GetAt(i
);
333 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
334 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
342 while(!m_Edges
->empty())
344 FSAedge
* pEdge
= m_Edges
->front();
345 m_Edges
->pop_front();
346 while(!(pEdge
->GetMorphemes()->empty()))
348 delete pEdge
->GetMorphemes()->front();
349 pEdge
->GetMorphemes()->pop_front();
354 while(!m_States
->empty())
356 FSAstate
* pState
= m_States
->front();
357 m_States
->pop_front();
358 while(!pState
->m_PathList
.empty())
360 delete pState
->m_PathList
.front();
361 pState
->m_PathList
.pop_front();
366 QMap
<QString
, FSAMorpheme
*>::iterator i
= m_morphemes
.begin();
367 while (i
!= m_morphemes
.end()) {
372 delete this->m_States
;
373 delete this->m_Edges
;
374 delete this->m_StartStates
;
377 void FSA::AddSignature(CSignature
* pSig
)
379 //std::cout << "---- Signature " <<pSig->GetNumberOfStems()<<" "<< pSig->GetNumberOfAffixes()<< std::endl;
382 FSAstate
* pStart
= new FSAstate(this);
383 pStart
->setMaxDepth(0);
384 this->m_StartStates
->push_back(pStart
);
386 FSAstate
* pMiddle
= new FSAstate(this);
387 FSAstate
* pEnd
= new FSAstate(this);
390 CStringSurrogate ssStem
;
391 FSAedge
* pEdge1
= new FSAedge(this, pStart
, pMiddle
);
392 for (int i
= 0; i
< pSig
->GetNumberOfStems(); i
++)
394 CStem
* pStem
= pSig
->GetStemPtrList()->at(i
);
395 QString str
= pStem
->Display();
397 int corpusCount
= pStem
->GetCorpusCount();
398 Q_ASSERT(corpusCount
>0);
401 //std::cout << "\tStem:"<<str.toStdString() <<"."<< corpusCount <<std::endl;
402 pEdge1
->AddMorpheme( str
,corpusCount
); //TODO
406 FSAedge
* pEdge2
= new FSAedge(this, pMiddle
, pEnd
);
407 for (int i
= 0; i
< pSig
->GetNumberOfAffixes(); i
++)
409 CSuffix
* pSuffix
= pSig
->GetSuffixPtrList()->at(i
);
410 QString suffixStr
= pSuffix
->Display();
413 //for(int j=0;j<pEdge1->GetMorphemes()->size();j++)
414 FSAMorphemeList::iterator j_iter
= pEdge1
->GetMorphemes()->begin();
415 while(j_iter
!= pEdge1
->GetMorphemes()->end())
417 FSAMorpheme
*pMorph
= *j_iter
++;
418 int stemCount
= pMorph
->m_corpusCount
;
419 Q_ASSERT(stemCount
>0);
420 QString analyzedWord
= pMorph
->toStr();
421 if (suffixStr
!="NULL")
422 analyzedWord
.append(suffixStr
);
424 CWordCollection
* wc
= this->m_lexicon
->GetWords();
426 CStem
* pStem
= ( *wc
^=analyzedWord
);
429 corpusCount
+=(pStem
->GetCorpusCount());
432 if(suffixStr
.size()>0)
434 //std::cout << "\t\tSuffix "<< suffixStr.toStdString() <<"."<<corpusCount<<std::endl;
435 pEdge2
->AddMorpheme(suffixStr
, corpusCount
);
439 //----------------------------------------------------------------//
442 void FSA::AddState(FSAstate
* pState
)
444 m_States
->push_back(pState
);
447 void FSA::AddEdge(FSAedge
* pEdge
)
449 m_Edges
->push_back(pEdge
);
450 pEdge
->GetFromState()->AddEdgeOut(pEdge
);
451 pEdge
->GetToState()->AddEdgeIn(pEdge
);
454 namespace linguistica
{
455 namespace fsa_helper
{
457 struct contains_edge
: std::unary_function
<const FSAedgeList
*, bool> {
459 contains_edge(FSAedge
* edge
) : e(edge
) { }
460 bool operator()(const FSAedgeList
* l
) const
462 return std::find(l
->begin(), l
->end(), e
) != l
->end();
468 void FSA::RemoveEdge(FSAedge
* pEdge
)
472 // remove all morphemes from edge, this does bookkeeping at FSA level
473 // currently unused, TestFunc only removes edge when Morph count is zero
474 if (pEdge
->GetMorphemes()->size() > 0)
476 FSAMorphemeList::iterator iter
= pEdge
->GetMorphemes()->begin();
477 while(iter
!= pEdge
->GetMorphemes()->end())
478 pEdge
->RemoveMorpheme(iter
++);
481 this->m_Edges
->remove(pEdge
);
482 pEdge
->GetFromState()->m_EdgesOut
.remove(pEdge
);
483 pEdge
->GetToState()->m_EdgesIn
.remove(pEdge
);
485 //remove paths that contain this edge
486 // currently not used -- paths will have been cleared by ResetDisplay
487 const linguistica::fsa_helper::contains_edge
contains_pEdge(pEdge
);
489 FSApathList::iterator path_it
= this->m_FSAPathList
.begin();
490 const FSApathList::iterator paths_end
= this->m_FSAPathList
.end();
492 while ((path_it
= find_if(path_it
,paths_end
,contains_pEdge
)) != paths_end
)
494 FSAedgeList
* const pPath
= *path_it
;
495 path_it
= this->m_FSAPathList
.erase(path_it
);
498 // end -- remove paths
503 void FSA::FSATestFunc()
505 CSuffixCollection
* lexSuffixes
= this->m_lexicon
->GetSuffixes();
506 lexSuffixes
->Sort(COUNT
);
507 CStemCollection
* lexStems
= this->m_lexicon
->GetStems();
509 for (int suf_idx
=0; suf_idx
<lexSuffixes
->GetCount(); ++suf_idx
)
511 if(suf_idx
>=TEST_FUNC_MAX
) {
512 //std::cout << "BREAK" << std::endl;
515 CSuffix
* pSuffix
= lexSuffixes
->GetAtSort(suf_idx
);
516 //std::cout << "Suffix:" << pSuffix->Display().toStdString() << std::endl;
517 QString suffixStr
= pSuffix
->Display();
519 if(suffixStr
== "NULL") continue;
521 //make copy now, list will be altered in while-loop
522 FSAedgeList edges
= FSAedgeList(*this->m_Edges
);
524 FSAedgeList::iterator edge_it
= edges
.begin();
525 while(edge_it
!= edges
.end())
527 FSAedge
* pEdge
= *edge_it
;
528 ++edge_it
; //increment now, pEdge may be deleted
530 FSAMorphemeList morphemesToReanalyze
;
531 FSAMorphemeList
* morphemes
= pEdge
->GetMorphemes();
532 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
533 foreach (FSAMorpheme
* pMorph
, *morphemes
)
535 QString edgeStr
= pMorph
->toStr();
536 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
538 if (edgeStr
.endsWith(suffixStr
) && edgeStr
!=suffixStr
)
540 QString stemStr
= edgeStr
.left(edgeStr
.length() - suffixStr
.length());
542 CStem
* pStem
= (*lexStems
^=stemStr
);
543 //if stemStr in Lexicon
546 //if stem takes this suffix
547 CSignature
* pSig
= pStem
->GetSuffixSignature();
548 for (int k
= 0;k
<pSig
->GetSuffixPtrList()->count();k
++)
550 CSuffix
* pSuffix
= pSig
->GetSuffixPtrList()->at(k
);
551 if(pSuffix
->Display() == suffixStr
)
553 morphemesToReanalyze
.push_back(pMorph
);
558 } //else skip this morpheme
560 if(!morphemesToReanalyze
.empty()) {
561 if( morphemes
->size()==morphemesToReanalyze
.size() ) {
562 //just do series split
563 this->DoSeriesSplit(pEdge
,suffixStr
.length(),true);
565 FSAedge
* pNewEdge
= this->DoParallelSplit(pEdge
,morphemesToReanalyze
);
566 this->DoSeriesSplit(pNewEdge
,suffixStr
.length(),true);
573 void FSA::AddPrefixes(CMiniLexicon
* pMiniLexicon
) //Should be Prefix MiniLex
575 CPrefixCollection
* lexPrefixes
= pMiniLexicon
->GetPrefixes();
576 Q_ASSERT(lexPrefixes
);
577 lexPrefixes
->Sort(COUNT
);
578 //CStemCollection* lexStems = this->m_lexicon->GetStems();
579 CStemCollection
* lexStems
= pMiniLexicon
->GetStems();
581 for (int suf_idx
=0; suf_idx
<lexPrefixes
->GetCount(); ++suf_idx
)
583 //if(suf_idx>=TEST_FUNC_MAX) { //Not used for prefixes
584 //std::cout << "BREAK" << std::endl;
588 CPrefix
* pPrefix
= lexPrefixes
->GetAtSort(suf_idx
);
589 std::cout
<< "Prefix:" << pPrefix
->Display().toStdString() << std::endl
;
590 QString prefixStr
= pPrefix
->Display();
592 if(prefixStr
== "NULL") continue;
594 //make copy now, list will be altered in while-loop
595 FSAedgeList edges
= FSAedgeList(*this->m_Edges
);
597 FSAedgeList::iterator edge_it
= edges
.begin();
598 while(edge_it
!= edges
.end())
600 FSAedge
* pEdge
= *edge_it
;
601 ++edge_it
; //increment now, pEdge may be deleted
603 FSAMorphemeList morphemesToReanalyze
;
604 FSAMorphemeList
* morphemes
= pEdge
->GetMorphemes();
605 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
606 foreach (FSAMorpheme
* pMorph
, *morphemes
)
608 QString edgeStr
= pMorph
->toStr();
609 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
611 if (edgeStr
.startsWith(prefixStr
) && edgeStr
!=prefixStr
)
613 QString stemStr
= edgeStr
.right(edgeStr
.length() - prefixStr
.length());
614 std::cout
<<" Morpheme:"<<edgeStr
.toStdString()<<std::endl
;
616 CStem
* pStem
= (*lexStems
^=stemStr
);
617 //if stemStr in Lexicon
619 if(pStem
&& pStem
->GetPrefixSignature())
621 //if stem takes this prefix
622 CSignature
* pSig
= pStem
->GetPrefixSignature();
623 Q_ASSERT(pSig
->GetPrefixPtrList()->count());
624 for (int k
= 0;k
<pSig
->GetPrefixPtrList()->count();k
++)
626 CPrefix
* pPrefix
= pSig
->GetPrefixPtrList()->at(k
);
627 if(pPrefix
->Display() == prefixStr
)
629 morphemesToReanalyze
.push_back(pMorph
);
635 } //else skip this morpheme
638 if(!morphemesToReanalyze
.empty()) {
639 if( morphemes
->size()==morphemesToReanalyze
.size() ) {
640 //just do series split
641 this->DoSeriesSplit(pEdge
,prefixStr
.length(),false);
643 FSAedge
* pNewEdge
= this->DoParallelSplit(pEdge
,morphemesToReanalyze
);
644 this->DoSeriesSplit(pNewEdge
,prefixStr
.length(),false);
656 this->m_searched
= true;
658 // get 'depth' of FSA
659 this-> m_MaxPathLength
=0;
661 //queue.append(this->m_StartState);
662 for ( FSAStateList::iterator i
=this->m_StartStates
->begin();
663 i
!= this->m_StartStates
->end(); ++i
)
665 FSAstate
* pStartState
=*i
; //this->m_StartStates->at(i);
668 FSAedgeList
* pPath
= new FSAedgeList();
669 pStartState
->addPath(pPath
);
671 queue
.push_back(pStartState
);
673 while(!queue
.empty())
675 //FSAstate* pState = queue.takeFirst();
676 FSAstate
* pState
= queue
.front();
679 FSAedgeList
* edgeList
= pState
->GetEdgesOut();
681 //if at leaf, add paths to FSA and do not search outbound edges
682 if(edgeList
->size() == 0)
684 //this->m_FSAPathList += *(pState->GetPathList());
685 for(FSApathList::iterator j
=pState
->GetPathList()->begin();
686 j
!= pState
->GetPathList()->end(); ++j
)
687 this->m_FSAPathList
.push_back(*j
);
691 //iter over outbound edges
692 for (FSAedgeList::iterator j
=edgeList
->begin(); j
!=edgeList
->end(); ++j
)
695 FSAstate
* pChildState
= pEdge
->GetToState();
696 pChildState
->m_DiscoverCount
++;
698 //add paths to pChildState
699 for(FSApathList::iterator l
=pState
->GetPathList()->begin();
700 l
!= pState
->GetPathList()->end(); ++l
)
702 FSAedgeList
* pPath
= *l
;
704 FSAedgeList
* pNewPath
= new FSAedgeList(*pPath
);
705 pNewPath
->push_back(pEdge
);
706 pChildState
->addPath(pNewPath
);
709 int maxChildDepth
= pChildState
->getMaxDepth();
711 //set length of max path to child
712 if(maxChildDepth
==-1 || pState
->getMaxDepth()+1 > maxChildDepth
)
714 pChildState
->setMaxDepth(pState
->getMaxDepth()+1);
715 maxChildDepth
= pChildState
->getMaxDepth();
718 if (maxChildDepth
> m_MaxPathLength
)
720 m_MaxPathLength
=maxChildDepth
;
723 Q_ASSERT(pChildState
->m_DiscoverCount
<= pChildState
->GetEdgesIn()->size());
725 if(pChildState
->m_DiscoverCount
== pChildState
->GetEdgesIn()->size())
726 queue
.push_back(pChildState
);
731 std::cout
<< "FSA DL:"<<this->ComputeDL() << std::endl
;
734 int FSA::GetRobustness(FSAedgeList
* pPath
)
736 QStringList pathWordsWorking
;
737 int pathCharCount
= 0;
738 FSAMorphemeList
* pEdge0Morphemes
= pPath
->front()->GetMorphemes();
739 FSAMorphemeList::iterator j_iter
= pEdge0Morphemes
->begin();
740 //for(int j=0;j<pEdge0Morphemes->size();j++)
741 while(j_iter
!=pEdge0Morphemes
->end())
743 FSAMorpheme
* pMorph
= *j_iter
++;
744 pathCharCount
+=pMorph
->toStr().length();
745 pathWordsWorking
.push_back(pMorph
->toStr());
748 FSAedgeList::iterator i
= pPath
->begin();
749 ++i
; //skip first edge, already got these morphemes (above)
750 while( i
!= pPath
->end() )
752 FSAedge
* pEdge
= *i
++;
753 QStringList newWords
;
754 FSAMorphemeList
* mList
= pEdge
->GetMorphemes();
755 FSAMorphemeList::iterator j_iter
= mList
->begin();
756 //for(int j=0;j<mList->size();j++)
757 while (j_iter
!=mList
->end())
759 QString morph
= (*j_iter
++)->toStr();
760 //std::cout<< "At morpheme:"<< morph.toStdString() <<std::endl;
761 pathCharCount
+= morph
.length();
762 for(int k
=0;k
<pathWordsWorking
.size();k
++)
764 QString newWord
= pathWordsWorking
.at(k
) + morph
;
765 //std::cout<< " created: "<< newWord.toStdString() <<std::endl;
766 newWords
.append(newWord
);
769 pathWordsWorking
.clear();
770 for(int j
=0;j
<newWords
.size();j
++)
771 pathWordsWorking
.append(newWords
.at(j
));
774 int charsReplacedCount
= 0;
775 for(int i
=0;i
<pathWordsWorking
.size();i
++)
777 charsReplacedCount
+=pathWordsWorking
.at(i
).length();
780 //std::cout<< " charsReplaced: "<< charsReplacedCount << " pathCharCount: "<< pathCharCount << std::endl;
781 return charsReplacedCount
-pathCharCount
;
784 void FSAstate::OutputXfst (QTextStream
& outf
)
786 FSAstate
* pState
=this;
787 if ( pState
->GetEdgesIn()->size() == 0)//start state
789 outf
<< "\ndefine "<< this->getStateName() << " 0; "<<endl
;
793 outf
<< "define " << this->getStateName() << " ";
794 if (pState
->GetEdgesIn()->size()>1) outf
<< " [ ";
796 for (FSAedgeList::iterator j_iter
=pState
->GetEdgesIn()->begin();
797 j_iter
!= pState
->GetEdgesIn()->end(); ++j_iter
)
799 if (j_iter
!= pState
->GetEdgesIn()->begin())
801 FSAedge
* pEdge
= *j_iter
;
802 outf
<< pEdge
->GetFromState()->getStateName();
804 if (pEdge
->GetMorphemes()->size()==1)
806 FSAMorpheme
* pm
= pEdge
->GetMorphemes()->front();
807 const QString ss
= pm
->toStr();
808 if (pm
->toStr()=="NULL")
811 outf
<< " {" << ss
.toAscii() << "} ";
815 //outf << QString("S%1 ").arg(pEdge->GetFromState()->m_stateId);
817 FSAMorphemeList::iterator k_iter
= pEdge
->GetMorphemes()->begin();
818 //for(int k=0;k<pEdge->GetMorphemes()->size();k++)
819 while(k_iter
!= pEdge
->GetMorphemes()->end())
821 if (k_iter
!=pEdge
->GetMorphemes()->begin()) outf
<< "|";
822 FSAMorpheme
* pm
= *k_iter
++;
823 const QString ss
= pm
->toStr();
824 if (pm
->toStr()=="NULL")
827 outf
<< " {" << ss
.toAscii() << "} ";
832 if (pState
->GetEdgesIn()->size()>1) outf
<< " ]";
837 void FSAstate::SearchEdgeXfst(QTextStream
& outf
, std::set
<FSAstate
*>& discovered
)
839 for(FSAedgeList::iterator i
=this->GetEdgesIn()->begin();
840 i
!= this->GetEdgesIn()->end(); i
++)
842 FSAstate
* pStateIn
= (*i
)->GetFromState();
843 if(discovered
.count(pStateIn
)==0) //if state not discovered yet
844 pStateIn
->SearchEdgeXfst(outf
,discovered
);
846 this->OutputXfst(outf
);
849 void FSA::OutputFSAXfst ( const QString
& filename
)
851 QFile
file( filename
);
853 FSAStateList topoSortedStates
;
854 std::set
<FSAstate
*> discovered
;
856 //find states with no outbound edges
857 FSAStateList endStates
;
858 for (FSAStateList::iterator i
=this->GetStates()->begin();
859 i
!= this->GetStates()->end();++i
)
861 if ((*i
)->GetEdgesOut()->size()==0)
862 endStates
.push_back(*i
);
866 if( file
.open( IO_WriteOnly
) )
868 QTextStream
outf( &file
); //Should be ascii file, not unicode
870 outf
<< "# " << endl
;
871 outf
<< "# File: " << filename
<< endl
;
872 outf
<< "# " << endl
;
874 //for (int i=0;i<this->GetStates()->size();i++)
875 for (FSAStateList::iterator i
= endStates
.begin();
876 i
!=endStates
.end(); ++i
)
878 (*i
)->SearchEdgeXfst(outf
,discovered
);
882 //outf << "union net" << endl << endl;
883 //outf << "print words" << endl << endl;
884 for (FSAStateList::iterator i
= endStates
.begin();
885 i
!=endStates
.end(); ++i
)
887 FSAstate
* pState
= *i
;
889 outf
<< "push " << pState
->getStateName() << endl
;
890 outf
<< "print words" << endl
<< endl
;
891 outf
<< "pop stack" << endl
<< endl
;
899 void FSA::FSAListDisplay(Q3ListView
* pView
,
900 QMap
<class QString
, class QString
>* /* out filter not supported yet */,
901 bool /* “analyzed words only” flag for future use */)
903 pView
->setRootIsDecorated( FALSE
);
904 pView
->setSorting(1);
905 // Remove all previous columns
906 while( pView
->columns() ) pView
->removeColumn( 0 );
909 // get 'depth' of FSA
913 pView
->setSorting(0); //sort on 0th column
915 // Add Column headers
916 //std::cout<<"Graph Max: "<<m_MaxPathLength <<std::endl;
918 for (int i
= 0;i
<this->m_MaxPathLength
;i
++)
920 pView
->addColumn( QString( "Edge %1" ).arg(i
));
922 pView
->addColumn( QString( "Robustness" ));
923 //m_pLexicon->GetDocument()->setStatusBar1( "Displaying FSA" );
925 //associate each start state with it's parent list view item (row).
926 // If start state only has one outbound path then there will
927 // be only one lv item, otherwise lv items/paths will be associated
928 // with a placeholder parent path
929 QMap
<FSAstate
*,FSAListViewItem
*> lvMap
;
930 //create placeholder lvItem for start states with >=1 path
931 for(FSApathList::iterator i
=this->m_FSAPathList
.begin();
932 i
!= this->m_FSAPathList
.end(); ++i
)
934 FSAedgeList
* pPath
= *i
;
935 FSAstate
* pStartState
= pPath
->front()->GetFromState();
936 if(!lvMap
.contains(pStartState
) && pStartState
->GetEdgesOut()->size()>1)
938 FSAListViewItem
* lvtop
= new FSAListViewItem(pView
,pPath
);
939 lvtop
->setVisible(TRUE
);
940 lvtop
->setOpen(TRUE
);
941 lvtop
->setText(0,"All paths: ");
942 lvMap
.insert(pStartState
,lvtop
);
946 //store totaled robustness values for subgraph starting at each start state
947 QMap
<FSAstate
*,int> robustnessMap
;
949 //Add Item for each start state
951 for (int i=0;i<this->m_StartStates->size();i++)
953 FSAstate* pStartState = m_StartStates->at(i);
955 FSAListViewItem * pStartStateItem= new FSAListViewItem(pView, this->m_FSAPathList.at(0)); //LEAK
956 pStartStateItem->setVisible(TRUE);
957 pStartStateItem->setOpen(TRUE);
959 lvMap.insert(pStartState,pStartStateItem);
962 //add every path as row to list view
963 for(FSApathList::iterator i
=this->m_FSAPathList
.begin();
964 i
!= this->m_FSAPathList
.end(); ++i
)
966 FSAedgeList
* pPath
= *i
;
968 int robustness
= FSA::GetRobustness(pPath
);
969 QString robustnessStr
= QString("%1").arg(robustness
,16);
971 FSAListViewItem
*lvItem
;
972 //get parent list view item
973 FSAstate
* pStartState
=pPath
->front()->GetFromState();
974 FSAListViewItem
* pParent
=lvMap
.value(pStartState
);
977 lvItem
= new FSAListViewItem(pView
, pPath
);
978 lvMap
.insert(pStartState
,lvItem
);
982 lvItem
= new FSAListViewItem(pParent
,pPath
);
984 int oldVal
= robustnessMap
.value(pStartState
);
985 robustnessMap
.insert(pStartState
,robustness
+oldVal
); //score of lvItem,pPath
988 lvItem
->setVisible(TRUE
);
989 lvItem
->setOpen(TRUE
);
993 for(FSAedgeList::iterator i
=pPath
->begin(); i
!=pPath
->end(); ++i
)
996 FSAMorphemeList
* labelList
= pEdge
->GetMorphemes();
997 QString label
= join(*labelList
, QString(' '));
998 if(label
.length() > 64)
999 label
= QString("%1 ... %2 (%3)")
1000 .arg(labelList
->front()->toStr())
1001 .arg(labelList
->back()->toStr())
1002 .arg(labelList
->size());
1005 lvItem
->setText(col_idx
,label
+" ");
1007 //if this edge spans multiple columns,
1008 //then columns / use whitespace
1009 col_idx
= pEdge
->GetToState()->getMaxDepth();
1012 //FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
1013 //pEdge = ((edgeList->isEmpty())? NULL : edgeList->first());
1015 lvItem
->setText(m_MaxPathLength
,robustnessStr
);
1018 QMap
<FSAstate
*,FSAListViewItem
*>::iterator lvm_it
= lvMap
.begin();
1019 while (lvm_it
!= lvMap
.end())
1021 FSAstate
* pState
= lvm_it
.key();
1022 if(pState
->GetEdgesOut()->size() > 1)
1024 int robVal
= robustnessMap
.value(pState
);
1025 QString robustnessStr
= QString("%1").arg(robVal
,16);
1026 lvm_it
.value()->setText(m_MaxPathLength
,robustnessStr
);
1033 //--------------------------------------------------------//
1035 FSAListViewItem::~FSAListViewItem()
1037 if(m_pImage
) delete m_pImage
;
1040 QPixmap
* FSAListViewItem::GetQPixmap()
1049 void FSAListViewItem::build_pixmap()
1052 const char outputName
[] = "tmp.png";
1053 QMap
<FSAstate
*,Agnode_t
*> nodeMap
;
1055 // generate new image
1056 GVC_t
* gvc
= gvContext();
1057 graph_t
* g
= agopen(const_cast<char*>("tmpgraph"), AGDIGRAPHSTRICT
);
1059 // left-to-right orientation
1060 agraphattr(g
, const_cast<char*>("rankdir"), const_cast<char*>(""));
1061 agset(g
, const_cast<char*>("rankdir"), const_cast<char*>("LR"));
1063 /*QString& start_state_name = this->GetLVStartState()->m_stateLabel;
1064 if (start_state_name.isEmpty())
1065 //start_state_name = "S";
1066 start_state_name = this->GetLVStartState()->getStateName();*/
1068 // create new Agnode_t for start state
1069 nodeMap
.insert(this->GetLVStartState(),
1070 agnode(g
, this->GetLVStartState()->getStateName().toUtf8().data()));
1072 QList
<FSAstate
*> queue
;
1073 queue
.append(this->GetLVStartState());
1077 while (!queue
.isEmpty()) {
1078 FSAstate
* pCurState
= queue
.takeFirst();
1080 FSAedgeList
& edges
= *pCurState
->GetEdgesOut();
1081 for (FSAedgeList::iterator i
= edges
.begin(); i
!= edges
.end(); ++i
) {
1082 FSAedge
* pEdge
= *i
;
1083 FSAstate
* pNextState
= pEdge
->GetToState();
1085 //label node, if does not already have a label
1086 /*if (pNextState->m_stateLabel.isEmpty())
1088 //QString("N%1").arg(j++);
1090 pNextState->m_stateLabel = QString("S%1").arg(pNextState->m_stateId);
1093 if (!nodeMap
.contains(pNextState
)) {
1094 //QString label_text = pNextState->m_stateLabel;
1095 QString label_text
= pNextState
->getStateName();
1096 nodeMap
.insert(pNextState
,
1097 agnode(g
, label_text
.toUtf8().data()));
1098 queue
.append(pNextState
);
1102 FSAMorphemeList
* labelList
= pEdge
->GetMorphemes();
1103 QString label
= join(*labelList
, QString(' '));
1104 if(label
.length() > 64)
1105 label
= QString("%1 ... %2 (%3)")
1106 .arg(labelList
->front()->toStr())
1107 .arg(labelList
->back()->toStr())
1108 .arg(labelList
->size());
1111 Agedge_t
* e1
= agedge(g
, nodeMap
.value(pCurState
),
1112 nodeMap
.value(pNextState
));
1113 agedgeattr(g
, const_cast<char*>("label"),
1114 const_cast<char*>(""));
1115 agset(e1
, const_cast<char*>("label"),
1116 const_cast<char*>(label
.toUtf8().data()));
1120 // start state has >=2 outbound paths
1121 GetLVStartState()->GetEdgesOut()->size() >= 2 &&
1123 std::find(this->m_pPath
->begin(),this->m_pPath
->end(),pEdge
)
1124 != this->m_pPath
->end()
1127 agedgeattr(g
, const_cast<char*>("style"),
1128 const_cast<char*>(""));
1129 agset(e1
, const_cast<char*>("style"),
1130 const_cast<char*>("bold"));
1135 gvLayout(gvc
, g
, const_cast<char*>("dot"));
1136 gvRenderFilename(gvc
, g
, const_cast<char*>("png"),
1137 const_cast<char*>(outputName
));
1139 // cleanup - free Layout
1140 gvFreeLayout(gvc
, g
);
1145 // close output file and free context
1150 QPixmap
* pp
= new QPixmap();//"dog.jpg");//QPixmap*uoutputName);
1151 //std::cout<<" Null:"<<(pp->isNull()?"Yes":"No")<<std::endl;
1157 void FSAListViewItem::DisplayEdgePath(Q3TextEdit
* m_commandLine
)
1160 FSAedge
* pEdge
=this->m_pPath
->front();//GetEdge();
1161 int tablength
= 0; //FIX
1163 m_commandLine
->insert( QString( "Edge %1:\n\n" ).arg(i
++) );
1164 FSAMorphemeList
* morphemes
= pEdge
->GetMorphemes();
1168 FSAMorphemeList::iterator j_iter
= morphemes
->begin();
1169 //for (int j = 0; j < morphemes->size(); )
1170 while(j_iter
!=morphemes
->end())
1172 //read up to 5 morphemes and print to line
1173 for( int k
=0; k
< nColumns
&& j_iter
!= morphemes
->end(); k
++ )
1175 //QString text = morphemes->at(j++)->toStr();//.stripWhiteSpace();
1176 QString text
= (*j_iter
++)->toStr();//.stripWhiteSpace();
1177 m_commandLine
->insert( text
+ "\t" );
1179 // Make tab length depend on the longest
1180 // word and font size
1181 int wordlength
= text
.length() * m_commandLine
->pointSize();
1182 if( tablength
< wordlength
) tablength
= wordlength
;
1184 m_commandLine
->insert( "\n" );
1186 m_commandLine
->insert( "\n" );
1188 //m_commandLine->insert(join(*morphemes, QString(' ')));
1189 FSAedgeList
* edgeList
= pEdge
->GetToState()->GetEdgesOut();
1190 if(!edgeList
->empty())
1191 pEdge
=edgeList
->front();
1194 m_commandLine
->insert("\n\n");
1196 m_commandLine
->setTabStopWidth( tablength
);
1200 void FSA::ResetDisplay()
1202 this->m_searched
=false;
1203 this->m_FSAPathList
.clear();
1205 for (FSAStateList::iterator i
=this->GetStates()->begin();
1206 i
!= this->GetStates()->end(); ++i
)
1208 FSAstate
* pState
= *i
;
1209 pState
->GetPathList()->clear();
1211 pState
->m_DiscoverCount
=0;
1212 pState
->m_stateLabel
=QString("");
1218 double FSA::ComputeDL()
1220 double morphemesDL
=0;
1222 QMap
<QString
,FSAMorpheme
*>::iterator m_it
;
1223 int mSum
=0; // sum morphCounter
1224 for (m_it
= this->m_morphemes
.begin(); m_it
!=m_morphemes
.end(); m_it
++)
1225 mSum
+= m_it
.value()->m_corpusCount
;
1227 QMap
<QString
,double> morphPtrDLMap
;
1228 for (m_it
= this->m_morphemes
.begin(); m_it
!=m_morphemes
.end(); m_it
++)
1230 FSAMorpheme
* pMorph
= m_it
.value();
1232 morphemesDL
+= pMorph
->GetDL(this->m_charCount
);
1234 //lenght of pointer to morpheme
1235 double cnt
= pMorph
->m_corpusCount
; // morphCounter.value(mi.key());
1236 double pDL
= base2log(mSum
/cnt
);
1237 //std::cout << mi.key().toStdString() << " \tDL: " << pDL << std::endl;
1238 morphPtrDLMap
.insert(pMorph
->toStr(), pDL
);
1241 std::cout
<< "morphemes DL: " << morphemesDL
<< std::endl
;
1243 double allStatesDL
=0;
1245 QMap
<FSAedge
*,double> edgePtrDLMap
;
1246 for (FSAStateList::iterator i
= this->m_States
->begin();
1247 i
!= this->m_States
->end(); ++i
)
1249 FSAstate
* pState
= *i
;
1251 for (FSAedgeList::iterator j_iter
=pState
->m_EdgesOut
.begin();
1252 j_iter
!= pState
->m_EdgesOut
.end(); ++j_iter
)
1254 FSAedge
* pEdge
= *j_iter
;
1255 int edgeWordCount
= 0;
1257 for(FSAMorphemeList::iterator k_iter
= pEdge
->GetMorphemes()->begin();
1258 k_iter
!= pEdge
->GetMorphemes()->end();
1261 FSAMorpheme
* pMorph
= *k_iter
;
1262 const QString
& morphStr
= pMorph
->toStr();
1263 edgeWordCount
+= pMorph
->m_corpusCount
;
1264 stateDL
+= morphPtrDLMap
[morphStr
];
1267 double edgePtrDL
= base2log(mSum
/edgeWordCount
);
1269 edgePtrDLMap
.insert(pEdge
,edgePtrDL
);
1271 allStatesDL
+=stateDL
;
1273 std::cout
<< "states DL: " << allStatesDL
<< std::endl
;
1275 return morphemesDL
+ allStatesDL
;
1278 /* This vn based on constructor, not used, using other vn based on TestFunc
1280 void FSA::AddPrefixes(CMiniLexicon* pMiniLexicon)
1282 //m_StartState->setMaxDepth(0);
1283 // this->AddState(m_StartState);
1285 //int MinimumNumberOfStemsForDisplay = 2;
1286 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
1287 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
1289 for (int i = 0; i < pMiniLexicon->GetSignatures()->GetCount(); i++)
1291 CSignature* pSig = pMiniLexicon->GetSignatures()->GetAt(i);
1293 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
1294 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
1295 AddPrefixSignature(pSig);
1301 void FSA::AddPrefixSignature(CSignature* pSig)
1303 CStringSurrogate ssStem;
1304 //FSAedge* pEdge1 = new FSAedge(this, pStart, pMiddle);
1305 for (int i = 0; i < pSig->GetNumberOfStems(); i++)
1307 CStem* pStem = pSig->GetStemPtrList()->at(i);
1308 QString stemStr = pStem->Display();
1310 int corpusCount = pStem->GetCorpusCount();
1311 Q_ASSERT(corpusCount>0);
1312 if(stemStr.size()>0)
1314 std::cout << "\tStem:"<<stemStr.toStdString() <<"."<< corpusCount <<std::endl;
1315 //pEdge1->AddMorpheme( stmStrr,corpusCount); //TODO
1316 Q_ASSERT(pSig->GetNumberOfAffixes());
1317 for (int j = 0; j < pSig->GetNumberOfAffixes(); j++)
1319 CPrefix* pPrefix= pSig->GetPrefixPtrList()->at(j);
1320 QString prefixStr = pPrefix->Display();
1321 if(prefixStr=="NULL") continue; //skip for now
1323 QString analyzedWord = (prefixStr == "NULL" ? "":prefixStr)+stemStr;
1324 std::cout << "\t\tWord:"<<analyzedWord.toStdString() <<"."<< corpusCount <<std::endl;
1325 //QList<FSAedge*> edges= this->m_morphemeEdges.values(analyzedWord);
1326 foreach(FSAedge * pEdge,edges)
1328 FSAMorphemeList * pEdgeMorphemes = pEdge->GetMorphemes();
1329 FSAMorphemeList::iterator m_it = pEdgeMorphemes->begin();
1330 while(m_it != pEdge->GetMorphemes()->end())
1332 if((*m_it)->toStr() == analyzedWord) break;
1335 FSAMorphemeList morphemesToReanalyze;
1336 morphemesToReanalyze.push_back(*m_it);
1337 FSAedge* pNewEdge = this->DoParallelSplit(pEdge,morphemesToReanalyze);
1338 this->DoSeriesSplit(pNewEdge,prefixStr.length(),false);