1 // Implementation of CSequencer methods
2 // Copyright © 2009 The University of Chicago
5 #include <Q3FileDialog>
6 #include <QInputDialog>
9 #include <Q3TextStream>
14 #include "ui/Status.h"
// Default constructor. The initializers visible here set the training-line
// limit (m_maxlineintrain) to 2000 and both n-gram "base" totals to 0; the
// constructor body itself is empty.
// NOTE(review): only part of the member-initializer list is visible in this
// listing -- confirm against the full file that every other member is set.
17 CSequencer::CSequencer()
20 m_maxlineintrain(2000),
31 m_totalbigramsbase(0),
32 m_totaltrigramsbase(0) { }
// Train the sequencer from a corpus file.
//
// Asks the user for a training file through a file-open dialog, reads it line
// by line, and counts bigrams, trigrams and their "base" prefixes
// (m_bigrams / m_bigramsbase / m_trigrams / m_trigramsbase). The counts are
// then turned into scores of the form -base2log(p(ngram) / p(base)) and stored
// in m_bigramprob / m_trigramprob. Finally both score tables are dumped to
// "bigrams.txt" and "trigrams.txt" as debug output.
//
// status: progress text is written to status.major_operation while reading
//         and scoring, and cleared once scoring is done.
//
// NOTE(review): this listing omits braces and a number of lines (loop headers,
// else branches, guard bodies, several local declarations). Comments below
// describe only what is visible; anything about the hidden parts is marked as
// an assumption.
34 void CSequencer::readCorpus(linguistica::ui::status_user_agent
& status
)
36 QString sequenceTrainFileName
;
38 QString firstWord
, secondWord
, thirdWord
;
39 QString leftWord
, rightWord
;
40 QString oneBigram
, oneTrigram
;
41 QString oneTrigrambase
;
46 QMap
<QString
, int>::Iterator StringToIntIt
;
47 QMap
<QString
, double>::Iterator StringToDoubleIt
;
// Let the user pick the training file.
49 sequenceTrainFileName
= Q3FileDialog::getOpenFileName(
50 sequenceTrainFileName
,
54 "Choose a train file to open");
// NOTE(review): the body of this check is not visible; presumably an early
// return when the dialog is cancelled, given the Q_ASSERT further down.
56 if (sequenceTrainFileName
.isEmpty())
59 // XXX. These should be bundled into a struct and made a local variable.
// Discard the results of any previous training run.
61 m_bigramsbase
.clear();
64 m_trigramsbase
.clear();
65 m_trigramprob
.clear();
68 m_totalbigramsbase
= 0;
69 m_totaltrigramsbase
= 0;
71 Q_ASSERT(!sequenceTrainFileName
.isEmpty());
72 QFile
trainFile(sequenceTrainFileName
);
74 if (trainFile
.open( QIODevice::ReadOnly
)) {
75 Q3TextStream
trainStream(&trainFile
);
76 // trainStream.setEncoding(QTextStream::Unicode);
// One iteration per line of the training file.
79 while (!trainStream
.atEnd()) {
80 oneLine
= trainStream
.readLine(); // This is one sentence.
// NOTE(review): body not visible; presumably skips empty lines.
82 if (oneLine
.length() == 0)
// Normalize the line: lower-case, trim, collapse runs of whitespace.
85 oneLine
= oneLine
.lower();
86 oneLine
= oneLine
.stripWhiteSpace();
87 oneLine
= oneLine
.simplifyWhiteSpace();
// Split at the first space: leftWord is the text before it, rightWord is
// everything after it.
94 loc
= oneLine
.find(" ");
96 leftWord
= oneLine
.left(loc
);
97 rightWord
= oneLine
.right(oneLine
.length() - loc
- 1);
// First word of the line: only a bigram (firstWord, secondWord) can be
// counted. Each counter below uses the same pattern: increment the entry if
// the key exists, otherwise insert it with a count of 1.
99 if (indexOfWord
== 0) {
100 secondWord
= leftWord
;
101 oneBigram
= firstWord
+ separator
+ secondWord
;
102 if (m_bigrams
.contains(oneBigram
))
103 m_bigrams
[oneBigram
]++;
105 m_bigrams
.insert(oneBigram
, 1);
// The bigram "base" is the first word of the bigram.
107 if (m_bigramsbase
.contains(firstWord
))
108 m_bigramsbase
[firstWord
]++;
110 m_bigramsbase
.insert(firstWord
, 1);
113 m_totalbigramsbase
++;
// NOTE(review): presumably the branch for later words (the `else` is not
// visible here). Counts the bigram (secondWord, thirdWord) and the trigram
// (firstWord, secondWord, thirdWord); the trigram base is the first two words.
115 thirdWord
= leftWord
;
117 oneBigram
= secondWord
+ separator
+ thirdWord
;
118 oneTrigram
= firstWord
+ separator
+ secondWord
+ separator
+ thirdWord
;
119 oneTrigrambase
= firstWord
+ separator
+ secondWord
;
122 if (m_bigrams
.contains(oneBigram
))
123 m_bigrams
[oneBigram
]++;
125 m_bigrams
.insert(oneBigram
, 1);
127 if (m_bigramsbase
.contains(secondWord
))
128 m_bigramsbase
[secondWord
]++;
130 m_bigramsbase
.insert(secondWord
, 1);
133 m_totalbigramsbase
++;
136 if (m_trigrams
.contains(oneTrigram
))
137 m_trigrams
[oneTrigram
]++;
139 m_trigrams
.insert(oneTrigram
, 1);
141 if (m_trigramsbase
.contains(oneTrigrambase
))
142 m_trigramsbase
[oneTrigrambase
]++;
144 m_trigramsbase
.insert(oneTrigrambase
, 1);
146 m_totaltrigramsbase
++;
148 // Move the first,second words
149 firstWord
= secondWord
;
150 secondWord
= thirdWord
;
// Look for the next space in the line.
154 loc
= oneLine
.find(" ");
// The token is a lone period: count it as the third word of the final
// bigram/trigram of the sentence.
160 if (leftWord
== QString(".")) {
161 // XXX. report errors to caller instead of aborting
162 // make sure we don't get a line like "."
163 Q_ASSERT(indexOfWord
> 0);
165 thirdWord
= leftWord
;
167 oneBigram
= secondWord
+ separator
+ thirdWord
;
168 oneTrigram
= firstWord
+ separator
+ secondWord
+ separator
+ thirdWord
;
169 oneTrigrambase
= firstWord
+ separator
+ secondWord
;
172 if (m_bigrams
.contains(oneBigram
))
173 m_bigrams
[oneBigram
]++;
175 m_bigrams
.insert(oneBigram
, 1);
177 if (m_bigramsbase
.contains(secondWord
))
178 m_bigramsbase
[secondWord
]++;
180 m_bigramsbase
.insert(secondWord
, 1);
183 m_totalbigramsbase
++;
186 if (m_trigrams
.contains(oneTrigram
))
187 m_trigrams
[oneTrigram
]++;
189 m_trigrams
.insert(oneTrigram
, 1);
191 if (m_trigramsbase
.contains(oneTrigrambase
))
192 m_trigramsbase
[oneTrigrambase
]++;
194 m_trigramsbase
.insert(oneTrigrambase
, 1);
197 m_totaltrigramsbase
++;
// The token ends with a period: the word is counted with its last character
// removed (left(length - 1)).
199 if (leftWord
.right(1) == QString(".")) {
200 if (indexOfWord
== 0) {
201 secondWord
= leftWord
.left(leftWord
.length() -1);
203 oneBigram
= firstWord
+ separator
+ secondWord
;
204 if (m_bigrams
.contains(oneBigram
))
205 m_bigrams
[oneBigram
]++;
207 m_bigrams
.insert(oneBigram
, 1);
209 if (m_bigramsbase
.contains(firstWord
))
210 m_bigramsbase
[firstWord
]++;
212 m_bigramsbase
.insert(firstWord
, 1);
215 m_totalbigramsbase
++;
217 thirdWord
= leftWord
.left(leftWord
.length() -1);
219 oneBigram
= secondWord
+ separator
+ thirdWord
;
220 oneTrigram
= firstWord
+ separator
+ secondWord
+ separator
+ thirdWord
;
221 oneTrigrambase
= firstWord
+ separator
+ secondWord
;
224 if (m_bigrams
.contains(oneBigram
))
225 m_bigrams
[oneBigram
]++;
227 m_bigrams
.insert(oneBigram
, 1);
229 if (m_bigramsbase
.contains(secondWord
))
230 m_bigramsbase
[secondWord
]++;
232 m_bigramsbase
.insert(secondWord
, 1);
235 m_totalbigramsbase
++;
238 if (m_trigrams
.contains(oneTrigram
))
239 m_trigrams
[oneTrigram
]++;
241 m_trigrams
.insert(oneTrigram
, 1);
243 if (m_trigramsbase
.contains(oneTrigrambase
))
244 m_trigramsbase
[oneTrigrambase
]++;
246 m_trigramsbase
.insert(oneTrigrambase
, 1);
249 m_totaltrigramsbase
++;
251 // Move the first,second words
252 firstWord
= secondWord
;
253 secondWord
= thirdWord
;
// NOTE(review): presumably the branch for a token without a trailing period
// (the `else` is not visible); same counting as above with leftWord unchanged.
256 if (indexOfWord
== 0) {
257 secondWord
= leftWord
;
259 oneBigram
= firstWord
+ separator
+ secondWord
;
260 if (m_bigrams
.contains(oneBigram
))
261 m_bigrams
[oneBigram
]++;
263 m_bigrams
.insert(oneBigram
, 1);
265 if (m_bigramsbase
.contains(firstWord
))
266 m_bigramsbase
[firstWord
]++;
268 m_bigramsbase
.insert(firstWord
, 1);
271 m_totalbigramsbase
++;
273 thirdWord
= leftWord
;
275 oneBigram
= secondWord
+ separator
+ thirdWord
;
276 oneTrigram
= firstWord
+ separator
+ secondWord
+ separator
+ thirdWord
;
277 oneTrigrambase
= firstWord
+ separator
+ secondWord
;
280 if (m_bigrams
.contains(oneBigram
))
281 m_bigrams
[oneBigram
]++;
283 m_bigrams
.insert(oneBigram
, 1);
285 if (m_bigramsbase
.contains(secondWord
))
286 m_bigramsbase
[secondWord
]++;
288 m_bigramsbase
.insert(secondWord
, 1);
291 m_totalbigramsbase
++;
294 if (m_trigrams
.contains(oneTrigram
))
295 m_trigrams
[oneTrigram
]++;
297 m_trigrams
.insert(oneTrigram
, 1);
299 if (m_trigramsbase
.contains(oneTrigrambase
))
300 m_trigramsbase
[oneTrigrambase
]++;
302 m_trigramsbase
.insert(oneTrigrambase
, 1);
305 m_totaltrigramsbase
++;
307 // Move the first,second words
308 firstWord
= secondWord
;
309 secondWord
= thirdWord
;
// Count a "." token as the third word, closing the sentence.
314 Q_ASSERT(indexOfWord
> 0);
316 thirdWord
= QString(".");
318 oneBigram
= secondWord
+ separator
+ thirdWord
;
319 oneTrigram
= firstWord
+ separator
+ secondWord
+ separator
+ thirdWord
;
320 oneTrigrambase
= firstWord
+ separator
+ secondWord
;
323 if (m_bigrams
.contains(oneBigram
))
324 m_bigrams
[oneBigram
]++;
326 m_bigrams
.insert(oneBigram
, 1);
328 if (m_bigramsbase
.contains(secondWord
))
329 m_bigramsbase
[secondWord
]++;
331 m_bigramsbase
.insert(secondWord
, 1);
334 m_totalbigramsbase
++;
337 if (m_trigrams
.contains(oneTrigram
))
338 m_trigrams
[oneTrigram
]++;
340 m_trigrams
.insert(oneTrigram
, 1);
342 if (m_trigramsbase
.contains(oneTrigrambase
))
343 m_trigramsbase
[oneTrigrambase
]++;
345 m_trigramsbase
.insert(oneTrigrambase
, 1);
348 m_totaltrigramsbase
++;
351 //oneLine = trainStream.readLine(); // This is the "return"
// Report progress to the caller.
354 status
.major_operation
=
355 QString("read line %1...").arg(numberOfLines
);
// NOTE(review): body not visible; presumably stops reading once the
// m_maxlineintrain limit is exceeded.
357 if (numberOfLines
> m_maxlineintrain
)
363 // Compute the bigram prob and trigram prob
364 double oneBigramProb
;
365 double oneTrigramProb
;
368 int numberOfProcessed
;
370 numberOfProcessed
= 0;
// For every counted bigram: relative frequency of the bigram divided by the
// relative frequency of its base (the text before the first separator),
// stored as a negated base-2 log.
371 for (StringToIntIt
= m_bigrams
.begin(); StringToIntIt
!= m_bigrams
.end(); StringToIntIt
++) {
372 oneBigram
= StringToIntIt
.key();
373 oneNumber
= StringToIntIt
.data();
374 oneBigramProb
= (double) oneNumber
/ m_totalbigrams
;
375 loc
= oneBigram
.find(separator
);
376 oneBase
= oneBigram
.left(loc
);
377 oneBaseProb
= (double) m_bigramsbase
[oneBase
] / m_totalbigramsbase
;
378 oneBigramProb
= -base2log(oneBigramProb
/ oneBaseProb
);
379 m_bigramprob
.insert(oneBigram
, oneBigramProb
);
381 status
.major_operation
=
382 QString("processing bigram %1...")
383 .arg(numberOfProcessed
);
386 numberOfProcessed
= 0;
// Same for trigrams; findRev locates the last separator, so the base is the
// first two words of the trigram.
387 for (StringToIntIt
= m_trigrams
.begin(); StringToIntIt
!= m_trigrams
.end(); StringToIntIt
++) {
388 oneTrigram
= StringToIntIt
.key();
389 oneNumber
= StringToIntIt
.data();
390 oneTrigramProb
= (double) oneNumber
/ m_totaltrigrams
;
391 loc
= oneTrigram
.findRev(separator
);
392 oneBase
= oneTrigram
.left(loc
);
393 oneBaseProb
= (double) m_trigramsbase
[oneBase
] / m_totaltrigramsbase
;
394 oneTrigramProb
= -base2log(oneTrigramProb
/ oneBaseProb
);
395 m_trigramprob
.insert(oneTrigram
, oneTrigramProb
);
397 status
.major_operation
=
398 QString("processing trigram %1...")
399 .arg(numberOfProcessed
);
401 status
.major_operation
.clear();
403 // Debug output bigrams and trigrams
// File names carry no directory component.
404 QString bigramFileName
= "bigrams.txt";
405 QString trigramFileName
= "trigrams.txt";
408 QFile
bigramFile(bigramFileName
);
409 QFile
trigramFile(trigramFileName
);
// One line per entry: the key with separators replaced by spaces, a space,
// then the score.
411 if (bigramFile
.open(QIODevice::WriteOnly
)) {
412 Q3TextStream
bigramStream(&bigramFile
);
413 for (StringToDoubleIt
= m_bigramprob
.begin(); StringToDoubleIt
!= m_bigramprob
.end(); StringToDoubleIt
++) {
414 oneKey
= StringToDoubleIt
.key();
415 oneValue
=StringToDoubleIt
.data();
416 oneKey
= oneKey
.replace(separator
, " ");
417 bigramStream
<< oneKey
<< " " << oneValue
<< endl
;
421 // XXX. handle error.
424 if (trigramFile
.open(QIODevice::WriteOnly
)) {
425 Q3TextStream
trigramStream(&trigramFile
);
426 for (StringToDoubleIt
= m_trigramprob
.begin(); StringToDoubleIt
!= m_trigramprob
.end(); StringToDoubleIt
++) {
427 oneKey
= StringToDoubleIt
.key();
428 oneValue
=StringToDoubleIt
.data();
429 oneKey
= oneKey
.replace(separator
, " ");
430 trigramStream
<< oneKey
<< " " << oneValue
<< endl
;
434 // XXX. handle error.
// Evaluate the sequencer on a test file.
//
// Asks the user for a file, feeds each normalized line to
// sequenceASentence(), accumulates the bigram/trigram ranks it reports, and
// shows the hit counts and average ranks in a message box.
//
// status: progress text is written to status.major_operation per sentence
//         and cleared after the loop.
//
// NOTE(review): this listing omits braces and some lines (guard bodies,
// several declarations); comments describe only what is visible.
439 void CSequencer::sequencerTestAFile(linguistica::ui::status_user_agent
& status
)
// Nothing to test against until a training corpus has been read.
441 if (m_totalbigrams
== 0) {
442 QMessageBox::information(NULL
, "Warning", "Please Read Training Corpus Firstly!", "OK");
// NOTE(review): sequenceTestFileName is passed as an argument inside its own
// initializer, i.e. it is read before it has been constructed.
// NOTE(review): the caption says "train file" although this dialog selects
// the test file.
446 QString sequenceTestFileName
= Q3FileDialog::getOpenFileName(
447 sequenceTestFileName
,
451 "Choose a train file to open");
// NOTE(review): body not visible; presumably returns when the dialog is
// cancelled.
453 if (sequenceTestFileName
.isEmpty())
456 QFile
testFile(sequenceTestFileName
);
459 int oneBiScore
, oneTriScore
;
460 int totalHitInBigramList
= 0;
461 int totalHitInTrigramList
= 0;
462 int totalSentences
= 0;
463 int totalHitInBigramListSumRanks
= 0;
464 int totalHitInTrigramListSumRanks
= 0;
465 double averageBiRanking
, averageTriRanking
;
467 if (testFile
.open(QIODevice::ReadOnly
)) {
468 Q3TextStream
testStream(&testFile
);
469 // testStream.setEncoding( QTextStream::Unicode );
// One iteration per line of the test file.
472 while (!testStream
.atEnd()) {
473 oneLine
= testStream
.readLine(); // This is one sentence.
474 if (oneLine
.length() == 0)
// Normalize the line: lower-case, trim, collapse runs of whitespace; the
// length is checked again afterwards because the line may now be empty.
476 oneLine
= oneLine
.lower();
477 oneLine
= oneLine
.stripWhiteSpace();
478 oneLine
= oneLine
.simplifyWhiteSpace();
479 if (oneLine
.length() == 0)
481 sequenceASentence(oneBiScore
, oneTriScore
, oneLine
);
// A score of 0 is treated as "no hit": only non-zero ranks are accumulated.
483 if (oneBiScore
!= 0) {
484 totalHitInBigramListSumRanks
+= oneBiScore
;
485 totalHitInBigramList
++;
487 if (oneTriScore
!= 0) {
488 totalHitInTrigramListSumRanks
+= oneTriScore
;
489 totalHitInTrigramList
++;
492 status
.major_operation
=
493 QString("testing sentence %1...")
// NOTE(review): body not visible; presumably stops once the m_maxlineintest
// limit is exceeded.
495 if (numberOfLines
> m_maxlineintest
)
498 status
.major_operation
.clear();
// NOTE(review): the divisors are 0 when no sentence produced a hit; as a
// floating-point division this yields NaN/inf rather than a meaningful
// average -- consider guarding.
501 averageBiRanking
= (double)totalHitInBigramListSumRanks
/ totalHitInBigramList
;
502 averageTriRanking
= (double)totalHitInTrigramListSumRanks
/ totalHitInTrigramList
;
// Summary dialog.
504 QMessageBox::information(NULL
,
506 QString("Total %1, Bi-Hit %2, Tri-Hit %3, RankingInBiHit %4, RankingInTriHit %5")
508 .arg(totalHitInBigramList
)
509 .arg(totalHitInTrigramList
)
510 .arg(averageBiRanking
)
511 .arg(averageTriRanking
), "OK");
// Re-sequence the words of one sentence using the trained n-gram scores.
//
// The sentence is reduced to a bag of words; every ordered word pair that is
// a known bigram seeds sequenizeFromABigram(), once for the bigram model and
// once for the trigram model. The collected candidates are sorted, logged to
// "SequencerLog.txt", and compared with the normalized input sentence.
//
// biRank / triRank: set from resultRanki; overwritten with the rank of the
//                   candidate that equals the normalized input sentence when
//                   one is found in the respective result list.
// inputSentence:    sentence to process; when empty, the sentence is asked
//                   for with an input dialog and the top result of each model
//                   is shown in a message box.
//
// NOTE(review): this listing omits braces and a number of lines (loop
// headers, else branches, guard bodies, several local declarations). Comments
// describe only what is visible; assumptions are marked.
515 void CSequencer::sequenceASentence(int& biRank
, int& triRank
, QString inputSentence
)
521 QMap
<QString
, int> bagOfWords
;
522 QMap
<QString
, int> currentBagOfWords
;
523 QMap
<QString
, int>::Iterator StringToIntIt1
, StringToIntIt2
;
524 QString rightSentence
;
525 QString oneWord
, twoWord
;
526 QString oneBigram
, bestBigram
;
529 double bigramcount
, bestbigramcount
;
530 Q3SortedList
<sentenceItem
> finalRankedSentences
;
531 Q3SortedList
<sentenceItem
> biResultRankedSentences
;
532 Q3SortedList
<sentenceItem
> triResultRankedSentences
;
533 sentenceItem
* oneSentenceItem
;
534 QString resultSentence
;
// The two result lists delete their items when cleared; the scratch list
// finalRankedSentences does not.
540 finalRankedSentences
.setAutoDelete(FALSE
);
541 biResultRankedSentences
.setAutoDelete(TRUE
);
542 triResultRankedSentences
.setAutoDelete(TRUE
);
// Nothing can be scored until a training corpus has been read.
544 if ( m_totalbigrams
== 0)
546 QMessageBox::information ( NULL
, "Warning","Please Read Training Corpus Firstly!", "OK" );
// No sentence supplied: ask the user for one.
553 if ( inputSentence
.length() ==0)
555 aSentence
= QInputDialog::getText(
556 "Sequencer_ASentence", "Enter a Sentence", QLineEdit::Normal
,
557 QString::null
, &ok
, NULL
);
559 getInputFromUI
= true;
// NOTE(review): presumably the else branch -- use the caller's sentence.
563 aSentence
= inputSentence
;
565 getInputFromUI
= false;
// NOTE(review): `ok` is written by the dialog call above; confirm it is also
// set on the non-dialog path (its declaration/initialization is not visible).
568 if ( ok
&& !aSentence
.isEmpty() )
570 // Parse this sentence into words
571 QString leftWord
, rightWord
;
// Normalize: lower-case, trim, collapse runs of whitespace.
575 aSentence
= aSentence
.lower();
576 aSentence
= aSentence
.stripWhiteSpace();
577 aSentence
= aSentence
.simplifyWhiteSpace();
// rightSentence accumulates the reference form "# w1 w2 ... ." that candidate
// sentences are compared against later.
579 rightSentence
= QString("#");
// Split at the first space: leftWord is the next token, rightWord the rest.
582 loc
= aSentence
.find(" ");
585 leftWord
= aSentence
.left(loc
);
586 rightWord
= aSentence
.right(aSentence
.length() - loc
- 1);
// Boundary markers typed by the user are not counted as words.
588 if (( leftWord
!= QString("#")) && (leftWord
!= QString(".")))
// Increment the word's count, or insert it with a count of 1.
591 if ( bagOfWords
.contains(leftWord
))
593 bagOfWords
[leftWord
]++;
597 bagOfWords
.insert(leftWord
, 1);
600 rightSentence
= rightSentence
+ " " + leftWord
;
// Continue with the remainder of the sentence.
604 aSentence
= rightWord
;
605 loc
= aSentence
.find(" ");
// Last token: what is left of the sentence.
608 leftWord
= aSentence
;
610 if ( leftWord
!= QString("."))
// Drop a trailing period from the last word.
612 if (leftWord
.right(1) == QString("."))
614 leftWord
= leftWord
.left(leftWord
.length() -1);
617 if (( leftWord
!= QString("#")) && (leftWord
!= QString(".")))
620 if ( bagOfWords
.contains(leftWord
))
622 bagOfWords
[leftWord
]++;
626 bagOfWords
.insert(leftWord
, 1);
629 rightSentence
= rightSentence
+ " " + leftWord
;
636 rightSentence
= rightSentence
+ " ." ;
// A single word has only one possible ordering.
638 if ( numberOfWords
== 1)
640 resultSentence
= QString("# ") + (bagOfWords
.begin()).key() + QString(" .") ;
641 //QMessageBox::information ( NULL, "All-Debug",resultSentence, "OK" );
648 //QMessageBox::information ( NULL, "Debug",QString("bag of word has %1 words.").arg(numberOfWords), "OK" );
650 bestbigramcount
=0.0;
// Outer loop: candidate first word of the seed bigram.
651 for ( StringToIntIt1
= bagOfWords
.begin(); StringToIntIt1
!= bagOfWords
.end(); StringToIntIt1
++)
653 oneWord
= StringToIntIt1
.key();
654 count
= StringToIntIt1
.data();
// NOTE(review): the condition choosing between these two assignments is not
// visible; presumably it depends on `count` (whether the word occurs more
// than once and may therefore pair with itself).
658 reasonablePair
= true;
662 reasonablePair
= false;
// Inner loop: candidate second word of the seed bigram.
665 for ( StringToIntIt2
= bagOfWords
.begin(); StringToIntIt2
!= bagOfWords
.end(); StringToIntIt2
++)
667 twoWord
= StringToIntIt2
.key();
// Two different words always form an acceptable pair.
669 if (( !reasonablePair
) && (oneWord
!= twoWord
))
671 reasonablePair
= true;
674 if ( !reasonablePair
)
679 oneBigram
= oneWord
+ separator
+ twoWord
;
// Only bigrams with a trained score are used as seeds.
681 if ( m_bigramprob
.contains(oneBigram
))
684 bigramcount
= m_bigramprob
[oneBigram
];
// Remaining words after the seed: copy the bag and take out one occurrence of
// each seed word.
686 currentBagOfWords
= bagOfWords
;
688 if (currentBagOfWords
[oneWord
] <= 1)
690 currentBagOfWords
.remove(oneWord
);
694 currentBagOfWords
[oneWord
]--;
697 if (currentBagOfWords
[twoWord
] <= 1)
699 currentBagOfWords
.remove(twoWord
);
703 currentBagOfWords
[twoWord
]--;
// Add the sentence-boundary markers to the bag of remaining words.
706 currentBagOfWords
.insert(QString("#"), 1);
707 currentBagOfWords
.insert(QString("."), 1);
710 // Test bigram sequencer
// Last argument 1 selects the bigram model.
712 sequenizeFromABigram(oneBigram
, bigramcount
, currentBagOfWords
, numberOfWords
, m_K
, finalRankedSentences
, 1);
// Hand the produced items over to the bigram result list.
714 for ( oneSentenceItem
=finalRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=finalRankedSentences
.next())
716 biResultRankedSentences
.append(oneSentenceItem
);
// Auto-delete is switched off before clearing so the items just handed over
// are not freed.
720 finalRankedSentences
.setAutoDelete(FALSE
);
721 finalRankedSentences
.clear();
724 // Test trigram sequencer
// Last argument 2 selects the trigram model.
726 sequenizeFromABigram(oneBigram
, bigramcount
, currentBagOfWords
, numberOfWords
, m_K
, finalRankedSentences
, 2);
728 for ( oneSentenceItem
=finalRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=finalRankedSentences
.next())
730 triResultRankedSentences
.append(oneSentenceItem
);
733 finalRankedSentences
.setAutoDelete(FALSE
);
734 finalRankedSentences
.clear();
// ---- Bigram results: log, rank, optionally display ----
746 if ( biResultRankedSentences
.count() != 0)
// The log is opened in append mode, so successive calls add to it.
749 QFile
file( "SequencerLog.txt" );
751 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
// NOTE(review): what follows the error message on open failure is not
// visible; confirm the stream below is not used with an unopened file.
753 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
757 Q3TextStream
outf( &file
);
759 outf
<< "******Bigram Results******" << endl
<<endl
;
766 biResultRankedSentences
.sort();
// Value reported when no candidate equals the reference sentence.
769 biRank
= resultRanki
;
770 for ( oneSentenceItem
=biResultRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=biResultRankedSentences
.next())
// Candidate text with separators replaced by spaces, comparable with
// rightSentence.
774 biResultKey
= oneSentenceItem
->m_key
;
775 biResultKey
.replace(separator
, " ");
777 if ( biResultKey
== rightSentence
)
779 biRank
= resultRanki
;
// NOTE(review): body not visible; presumably stops once more than m_resultK
// candidates have been visited.
783 if ( resultRanki
> m_resultK
)
789 // log history of this result sentence
790 outf
<< "Result Sentence Rank " << resultRanki
<< " : " << biResultKey
<< endl
;
// History entries are read with keys 1 .. m_stepnumber.
791 for ( int stepi
= 1; stepi
<= oneSentenceItem
->m_stepnumber
; stepi
++)
793 QString oneHistoryString
;
794 double oneHistoryScore
;
796 oneHistoryString
= oneSentenceItem
->m_historystrings
[stepi
];
797 oneHistoryString
= oneHistoryString
.replace(separator
, " ");
798 oneHistoryScore
= oneSentenceItem
->m_historyscores
[stepi
];
800 outf
<< " " << stepi
<< " : " << oneHistoryString
<< " : " << oneHistoryScore
<< endl
;
// Interactive use: show the first candidate of the sorted list.
809 if ( getInputFromUI
)
813 oneSentenceItem
=biResultRankedSentences
.first();
815 biResultKey
= oneSentenceItem
->m_key
;
816 biResultKey
.replace(separator
, " ");
818 QMessageBox::information ( NULL
, "Top Bigram Sequencer Output",biResultKey
, "OK" );
// Auto-delete is on for this list, so clearing frees the items.
824 biResultRankedSentences
.clear();
// ---- Trigram results: same procedure as for the bigram results ----
830 if ( triResultRankedSentences
.count() != 0)
833 QFile
file( "SequencerLog.txt" );
835 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
837 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
841 Q3TextStream
outf( &file
);
843 outf
<< "******Trigram Results******" << endl
<<endl
;
846 QString triResultKey
;
848 triResultRankedSentences
.sort();
851 triRank
= resultRanki
;
852 for ( oneSentenceItem
=triResultRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=triResultRankedSentences
.next())
856 triResultKey
= oneSentenceItem
->m_key
;
857 triResultKey
.replace(separator
, " ");
859 if ( triResultKey
== rightSentence
)
861 triRank
= resultRanki
;
865 if ( resultRanki
> m_resultK
)
870 // log history of this result sentence
871 outf
<< "Result Sentence Rank " << resultRanki
<< " : " << triResultKey
<< endl
;
872 for ( int stepi
= 1; stepi
<= oneSentenceItem
->m_stepnumber
; stepi
++)
874 QString oneHistoryString
;
875 double oneHistoryScore
;
877 oneHistoryString
= oneSentenceItem
->m_historystrings
[stepi
];
878 oneHistoryString
= oneHistoryString
.replace(separator
, " ");
879 oneHistoryScore
= oneSentenceItem
->m_historyscores
[stepi
];
881 outf
<< " " << stepi
<< " : " << oneHistoryString
<< " : " << oneHistoryScore
<< endl
;
891 if ( getInputFromUI
)
893 QString triResultKey
;
895 oneSentenceItem
=triResultRankedSentences
.first();
897 triResultKey
= oneSentenceItem
->m_key
;
898 triResultKey
.replace(separator
, " ");
900 QMessageBox::information ( NULL
, "Top Trigram Sequencer Output",triResultKey
, "OK" );
905 triResultRankedSentences
.clear();
// Second variant of sentence re-sequencing.
//
// Like sequenceASentence(), the sentence is reduced to a bag of words and
// every ordered word pair that is a known bigram becomes a seed. Here the
// seeds are stored directly as sentenceItem objects in the bigram and trigram
// result lists, and sequenize2() is then called once per model on those
// lists. Results are sorted, logged to "SequencerLog2.txt", and compared with
// the normalized input sentence.
//
// biRank / triRank: set from resultRanki; overwritten with the rank of the
//                   candidate that equals the normalized input sentence when
//                   one is found in the respective result list.
// inputSentence:    sentence to process; when empty, the sentence is asked
//                   for with an input dialog and the top result of each model
//                   is shown in a message box.
//
// NOTE(review): this listing omits braces and a number of lines (loop
// headers, else branches, guard bodies, several local declarations). Comments
// describe only what is visible; assumptions are marked.
921 void CSequencer::sequenceASentence2(int& biRank
, int& triRank
, QString inputSentence
)
927 QMap
<QString
, double> allBigrams
;
928 QMap
<QString
, int> bagOfWords
;
929 QMap
<QString
, int> currentBagOfWords
;
930 QMap
<QString
, int>::Iterator StringToIntIt1
, StringToIntIt2
;
931 QString rightSentence
;
932 QString oneWord
, twoWord
;
933 QString oneBigram
, bestBigram
;
936 double bigramcount
, bestbigramcount
;
937 Q3SortedList
<sentenceItem
> finalRankedSentences
;
938 Q3SortedList
<sentenceItem
> biResultRankedSentences
;
939 Q3SortedList
<sentenceItem
> triResultRankedSentences
;
940 sentenceItem
* oneSentenceItem
, *twoSentenceItem
;
941 QString resultSentence
;
944 QMap
<int, QString
> currentHistoryString
;
945 QMap
<int, double> currentHistoryScore
;
// The two result lists delete their items when cleared; finalRankedSentences
// does not.
950 finalRankedSentences
.setAutoDelete(FALSE
);
951 biResultRankedSentences
.setAutoDelete(TRUE
);
952 triResultRankedSentences
.setAutoDelete(TRUE
);
// Nothing can be scored until a training corpus has been read.
954 if ( m_totalbigrams
== 0)
956 QMessageBox::information ( NULL
, "Warning","Please Read Training Corpus Firstly!", "OK" );
// No sentence supplied: ask the user for one.
963 if ( inputSentence
.length() ==0)
965 aSentence
= QInputDialog::getText(
966 "Sequencer_ASentence", "Enter a Sentence", QLineEdit::Normal
,
967 QString::null
, &ok
, NULL
);
969 getInputFromUI
= true;
// NOTE(review): presumably the else branch -- use the caller's sentence.
973 aSentence
= inputSentence
;
975 getInputFromUI
= false;
// NOTE(review): `ok` is written by the dialog call above; confirm it is also
// set on the non-dialog path (its declaration/initialization is not visible).
978 if ( ok
&& !aSentence
.isEmpty() )
980 // Parse this sentence into words
981 QString leftWord
, rightWord
;
// Normalize: lower-case, trim, collapse runs of whitespace.
985 aSentence
= aSentence
.lower();
986 aSentence
= aSentence
.stripWhiteSpace();
987 aSentence
= aSentence
.simplifyWhiteSpace();
// rightSentence accumulates the reference form "# w1 w2 ... ." that candidate
// sentences are compared against later.
989 rightSentence
= QString("#");
// Split at the first space: leftWord is the next token, rightWord the rest.
992 loc
= aSentence
.find(" ");
995 leftWord
= aSentence
.left(loc
);
996 rightWord
= aSentence
.right(aSentence
.length() - loc
- 1);
// Boundary markers typed by the user are not counted as words.
998 if (( leftWord
!= QString("#")) && (leftWord
!= QString(".")))
// Increment the word's count, or insert it with a count of 1.
1001 if ( bagOfWords
.contains(leftWord
))
1003 bagOfWords
[leftWord
]++;
1007 bagOfWords
.insert(leftWord
, 1);
1010 rightSentence
= rightSentence
+ " " + leftWord
;
// Continue with the remainder of the sentence.
1014 aSentence
= rightWord
;
1015 loc
= aSentence
.find(" ");
// Last token: what is left of the sentence.
1018 leftWord
= aSentence
;
1020 if ( leftWord
!= QString("."))
// Drop a trailing period from the last word.
1022 if (leftWord
.right(1) == QString("."))
1024 leftWord
= leftWord
.left(leftWord
.length() -1);
1027 if (( leftWord
!= QString("#")) && (leftWord
!= QString(".")))
1030 if ( bagOfWords
.contains(leftWord
))
1032 bagOfWords
[leftWord
]++;
1036 bagOfWords
.insert(leftWord
, 1);
1039 rightSentence
= rightSentence
+ " " + leftWord
;
1046 rightSentence
= rightSentence
+ " ." ;
// A single word has only one possible ordering.
1048 if ( numberOfWords
== 1)
1050 resultSentence
= QString("# ") + (bagOfWords
.begin()).key() + QString(" .") ;
1051 //QMessageBox::information ( NULL, "All-Debug",resultSentence, "OK" );
1058 //QMessageBox::information ( NULL, "Debug",QString("bag of word has %1 words.").arg(numberOfWords), "OK" );
1060 bestbigramcount
=0.0;
// Outer loop: candidate first word of the seed bigram.
1061 for ( StringToIntIt1
= bagOfWords
.begin(); StringToIntIt1
!= bagOfWords
.end(); StringToIntIt1
++)
1063 oneWord
= StringToIntIt1
.key();
1064 count
= StringToIntIt1
.data();
// NOTE(review): the condition choosing between these two assignments is not
// visible; presumably it depends on `count` (whether the word occurs more
// than once and may therefore pair with itself).
1068 reasonablePair
= true;
1072 reasonablePair
= false;
// Inner loop: candidate second word of the seed bigram.
1075 for ( StringToIntIt2
= bagOfWords
.begin(); StringToIntIt2
!= bagOfWords
.end(); StringToIntIt2
++)
1077 twoWord
= StringToIntIt2
.key();
// Two different words always form an acceptable pair.
1079 if (( !reasonablePair
) && (oneWord
!= twoWord
))
1081 reasonablePair
= true;
1084 if ( !reasonablePair
)
1089 oneBigram
= oneWord
+ separator
+ twoWord
;
// Only bigrams with a trained score are used as seeds.
1091 if ( m_bigramprob
.contains(oneBigram
))
1095 bigramcount
= m_bigramprob
[oneBigram
];
1097 // bigram bag of words
// NOTE(review): the clear() is redundant; the assignment that follows
// replaces the contents anyway.
1098 currentBagOfWords
.clear();
1099 currentBagOfWords
= bagOfWords
;
// Take one occurrence of each seed word out of the copied bag.
1101 if (currentBagOfWords
[oneWord
] <= 1)
1103 currentBagOfWords
.remove(oneWord
);
1107 currentBagOfWords
[oneWord
]--;
1110 if (currentBagOfWords
[twoWord
] <= 1)
1112 currentBagOfWords
.remove(twoWord
);
1116 currentBagOfWords
[twoWord
]--;
// Add the sentence-boundary markers to the bag of remaining words.
1119 currentBagOfWords
.insert(QString("#"), 1);
1120 currentBagOfWords
.insert(QString("."), 1);
1122 currentHistoryString
.clear();
1123 currentHistoryScore
.clear();
// Seed item for the bigram list: constructed with the bigram's score, marked
// as a two-word sentence, and m_value divided by that word count.
1125 oneSentenceItem
= new sentenceItem(bigramcount
, oneBigram
, currentBagOfWords
, 1, currentHistoryString
, currentHistoryScore
);
1126 oneSentenceItem
->m_numberofwordsinsentence
= 2;
1127 oneSentenceItem
->m_value
= oneSentenceItem
->m_value
/ (double)oneSentenceItem
->m_numberofwordsinsentence
;
// Seed item for the trigram list: same key and bag, constructed with 0 as
// its first argument.
1129 twoSentenceItem
= new sentenceItem(0, oneBigram
, currentBagOfWords
, 1, currentHistoryString
, currentHistoryScore
);
1130 twoSentenceItem
->m_numberofwordsinsentence
= 2;
1131 twoSentenceItem
->m_value
= twoSentenceItem
->m_value
/ (double)twoSentenceItem
->m_numberofwordsinsentence
;
// The items are appended to lists that have auto-delete switched on.
1133 biResultRankedSentences
.append(oneSentenceItem
);
1134 triResultRankedSentences
.append(twoSentenceItem
);
1146 // Test Bigram Sequencerizer2
1147 sequenize2(bagOfWords
, numberOfWords
, m_K
, biResultRankedSentences
, 1);
1149 // Test Trigram Sequencerizer2
1150 sequenize2(bagOfWords
, numberOfWords
, m_K
, triResultRankedSentences
, 2);
// ---- Bigram results: log, rank, optionally display ----
1153 if ( biResultRankedSentences
.count() != 0)
// The log is opened in append mode, so successive calls add to it.
1156 QFile
file( "SequencerLog2.txt" );
1158 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
// NOTE(review): what follows the error message on open failure is not
// visible; confirm the stream below is not used with an unopened file.
1160 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1164 Q3TextStream
outf( &file
);
1166 outf
<< "******Bigram Results******" << endl
<<endl
;
1170 QString biResultKey
;
1173 biResultRankedSentences
.sort();
// Value reported when no candidate equals the reference sentence.
1176 biRank
= resultRanki
;
1177 for ( oneSentenceItem
=biResultRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=biResultRankedSentences
.next())
// Candidate text with separators replaced by spaces, comparable with
// rightSentence.
1181 biResultKey
= oneSentenceItem
->m_key
;
1182 biResultKey
.replace(separator
, " ");
1184 if ( biResultKey
== rightSentence
)
1186 biRank
= resultRanki
;
// NOTE(review): body not visible; presumably stops once more than m_resultK
// candidates have been visited.
1190 if ( resultRanki
> m_resultK
)
1196 // log history of this result sentence
1197 outf
<< "Result Sentence Rank " << resultRanki
<< " : " << biResultKey
<< endl
;
// History entries are read with keys 1 .. m_stepnumber.
1198 for ( int stepi
= 1; stepi
<= oneSentenceItem
->m_stepnumber
; stepi
++)
1200 QString oneHistoryString
;
1201 double oneHistoryScore
;
1203 oneHistoryString
= oneSentenceItem
->m_historystrings
[stepi
];
1204 oneHistoryString
= oneHistoryString
.replace(separator
, " ");
1205 oneHistoryScore
= oneSentenceItem
->m_historyscores
[stepi
];
1207 outf
<< " " << stepi
<< " : " << oneHistoryString
<< " : " << oneHistoryScore
<< endl
;
// Interactive use: show the first candidate of the sorted list.
1216 if ( getInputFromUI
)
1220 oneSentenceItem
=biResultRankedSentences
.first();
1222 biResultKey
= oneSentenceItem
->m_key
;
1223 biResultKey
.replace(separator
, " ");
1225 QMessageBox::information ( NULL
, "Top Bigram Sequencer Output",biResultKey
, "OK" );
// Auto-delete is (re)enabled before clearing so the items are freed.
1229 biResultRankedSentences
.setAutoDelete(TRUE
);
1230 biResultRankedSentences
.clear();
// ---- Trigram results: same procedure as for the bigram results ----
1236 if ( triResultRankedSentences
.count() != 0)
1239 QFile
file( "SequencerLog2.txt" );
1241 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1243 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1247 Q3TextStream
outf( &file
);
1249 outf
<< "******Trigram Results******" << endl
<<endl
;
1252 QString triResultKey
;
1254 triResultRankedSentences
.sort();
1257 triRank
= resultRanki
;
1258 for ( oneSentenceItem
=triResultRankedSentences
.first(); oneSentenceItem
!= 0; oneSentenceItem
=triResultRankedSentences
.next())
1262 triResultKey
= oneSentenceItem
->m_key
;
1263 triResultKey
.replace(separator
, " ");
1265 if ( triResultKey
== rightSentence
)
1267 triRank
= resultRanki
;
1271 if ( resultRanki
> m_resultK
)
1276 // log history of this result sentence
1277 outf
<< "Result Sentence Rank " << resultRanki
<< " : " << triResultKey
<< endl
;
1278 for ( int stepi
= 1; stepi
<= oneSentenceItem
->m_stepnumber
; stepi
++)
1280 QString oneHistoryString
;
1281 double oneHistoryScore
;
1283 oneHistoryString
= oneSentenceItem
->m_historystrings
[stepi
];
1284 oneHistoryString
= oneHistoryString
.replace(separator
, " ");
1285 oneHistoryScore
= oneSentenceItem
->m_historyscores
[stepi
];
1287 outf
<< " " << stepi
<< " : " << oneHistoryString
<< " : " << oneHistoryScore
<< endl
;
1297 if ( getInputFromUI
)
1299 QString triResultKey
;
1301 oneSentenceItem
=triResultRankedSentences
.first();
1303 triResultKey
= oneSentenceItem
->m_key
;
1304 triResultKey
.replace(separator
, " ");
1306 QMessageBox::information ( NULL
, "Top Trigram Sequencer Output",triResultKey
, "OK" );
1310 triResultRankedSentences
.setAutoDelete(TRUE
);
1311 triResultRankedSentences
.clear();
1327 void CSequencer::sequenizeFromABigram(
1330 QMap
<QString
, int>& bagOfWords
,
1333 Q3SortedList
<sentenceItem
>& resultKSentences
,
1336 // computeType: 1 --> bigram; 2 --> trigram
1338 QMap
<QString
, int>::Iterator StringToIntIt
;
1339 double currentValue
;
1340 QString currentString
;
1341 QString currentleftBigrambase
, currentrightBigrambase
;
1342 QString currentleftTrigrambase
, currentrightTrigrambase
;
1343 QMap
<QString
, int>* currentBagOfWords
;
1345 int leftFirstLoc
, rightFirstLoc
;
1346 int leftSecondLoc
, rightSecondLoc
;
1347 int lenOfSeparator
= separator
.length();
1348 QMap
<QString
, int> oneTryBagOfWords
;
1349 Q3SortedList
<sentenceItem
> tempResultKSentence
;
1350 Q3SortedList
<sentenceItem
> swapResultKSentence
;
1351 sentenceItem
* oneSentenceItem
;
1352 sentenceItem
* oneCurrentItem
;
1354 bool canExpandLeft
, canExpandRight
;
1356 QMap
<int, QString
> currentHistoryString
;
1357 QMap
<int, double> currentHistoryScore
;
1358 int currentStepNumber
;
1361 resultKSentences
.setAutoDelete( TRUE
);
1362 tempResultKSentence
.setAutoDelete( FALSE
);
1363 resultKSentences
.clear();
1366 // First, put this bigram in resultKSentences;
1367 currentHistoryString
.clear();
1368 currentHistoryScore
.clear();
1369 oneSentenceItem
= new sentenceItem(bigramValue
, oneBigram
, bagOfWords
, 1, currentHistoryString
, currentHistoryScore
);
1370 resultKSentences
.append(oneSentenceItem
);
1373 // Loop for lenOfSentence( abc doesn't count the beginning "#" and ending ".")
1374 // Each loop refers to expand one word either from left or right
1376 for ( i
=0; i
<lenOfSentence
; i
++)
1378 // for each expansion. Total : N loops
1380 if ( i
== lenOfSentence
-1)
1390 tempResultKSentence
.clear();
1391 for ( oneCurrentItem
=resultKSentences
.first(); oneCurrentItem
!= 0; oneCurrentItem
=resultKSentences
.next())
1393 // for each current string. Total: K loops
1395 currentString
= oneCurrentItem
->m_key
;
1396 currentValue
= oneCurrentItem
->m_value
;
1397 currentBagOfWords
= &(oneCurrentItem
->m_bagofwords
);
1398 currentHistoryString
.clear();
1399 currentHistoryString
= oneCurrentItem
->m_historystrings
;
1400 currentHistoryScore
.clear();
1401 currentHistoryScore
= oneCurrentItem
->m_historyscores
;
1402 currentStepNumber
= oneCurrentItem
->m_stepnumber
;
1404 leftFirstLoc
= currentString
.find(separator
);
1405 currentleftBigrambase
= currentString
.left(leftFirstLoc
);
1406 tempString
= currentString
.right(currentString
.length() - leftFirstLoc
- lenOfSeparator
);
1407 leftSecondLoc
= tempString
.find(separator
);
1409 if ( leftSecondLoc
== -1)
1411 currentleftTrigrambase
= currentString
;
1415 currentleftTrigrambase
= currentString
.left(leftFirstLoc
+ lenOfSeparator
+ leftSecondLoc
);
1419 rightFirstLoc
= currentString
.findRev(separator
);
1420 currentrightBigrambase
= currentString
.right(currentString
.length() - rightFirstLoc
- lenOfSeparator
);
1421 tempString
= currentString
.left(rightFirstLoc
);
1422 rightSecondLoc
= tempString
.findRev(separator
);
1424 if ( rightSecondLoc
== -1)
1426 currentrightTrigrambase
= currentString
;
1430 currentrightTrigrambase
= currentString
.right(currentString
.length() - rightSecondLoc
- lenOfSeparator
);
1434 if (currentleftBigrambase
== QString("#"))
1436 canExpandLeft
= false;
1440 canExpandLeft
= true;
1444 if (currentrightBigrambase
== QString("."))
1446 canExpandRight
= false;
1450 canExpandRight
= true;
1454 if ( (!canExpandLeft
) && (!canExpandRight
))
1459 for ( StringToIntIt
= currentBagOfWords
->begin(); StringToIntIt
!= currentBagOfWords
->end(); StringToIntIt
++)
1461 // For each possible word. Total : (N - M) loops
1463 QString onePossibleExpansion
;
1464 QString leftExpansion
;
1465 QString rightExpansion
;
1469 onePossibleExpansion
= StringToIntIt
.key();
1472 // Try Left Expansion
1473 if (( onePossibleExpansion
!= QString(".")) && (canExpandLeft
))
1475 if ((onePossibleExpansion
== QString("#")) && (!canExpandRight
) && (!lastWord
))
1481 leftExpansion
= onePossibleExpansion
+ separator
+ currentString
;
1483 if ( computeType
== 1)
1485 QString oneTryBigram
;
1488 oneTryBigram
= onePossibleExpansion
+ separator
+ currentleftBigrambase
;
1490 if (! m_bigramprob
.contains(oneTryBigram
))
1492 if (m_bigramsbase
.contains(onePossibleExpansion
))
1494 oneValue
= 10.0; // big punishment
1498 oneValue
= 5.0; // mild punishment
1504 oneValue
= m_bigramprob
[oneTryBigram
];
1507 leftValue
= currentValue
+ oneValue
;
1510 else if ( computeType
== 2)
1513 QString oneTryTrigram
;
1514 QString oneTryTrigrambase
;
1517 oneTryTrigrambase
= onePossibleExpansion
+ separator
+ currentleftBigrambase
;
1518 oneTryTrigram
= onePossibleExpansion
+ separator
+ currentleftTrigrambase
;
1522 if (! m_trigramprob
.contains(oneTryTrigram
))
1524 if (m_trigramsbase
.contains(oneTryTrigrambase
))
1526 oneValue
= 10.0; // big punishment
1530 oneValue
= 5.0; // mild punishment
1536 oneValue
= m_trigramprob
[oneTryTrigram
];
1539 leftValue
= currentValue
+ oneValue
;
1548 // create a sentenceItem
1549 oneTryBagOfWords
= (*currentBagOfWords
);
1550 if ( oneTryBagOfWords
[onePossibleExpansion
] > 1)
1552 oneTryBagOfWords
[onePossibleExpansion
]--;
1556 oneTryBagOfWords
.remove(onePossibleExpansion
);
1559 oneSentenceItem
= new sentenceItem(leftValue
, leftExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
1561 tempResultKSentence
.append(oneSentenceItem
);
1566 // Try Right Expansion
1567 if ( (onePossibleExpansion
!= QString("#")) && (canExpandRight
))
1569 if ((onePossibleExpansion
== QString(".")) && (!canExpandLeft
) && (!lastWord
))
1574 rightExpansion
= currentString
+ separator
+ onePossibleExpansion
;
1576 if ( computeType
== 1)
1578 QString oneTryBigram
;
1581 oneTryBigram
= currentrightBigrambase
+ separator
+ onePossibleExpansion
;
1583 if (! m_bigramprob
.contains(oneTryBigram
))
1585 if (m_bigramsbase
.contains(currentrightBigrambase
))
1587 oneValue
= 10.0; // big punishment
1591 oneValue
= 5.0; // mild punishment
1597 oneValue
= m_bigramprob
[oneTryBigram
];
1600 rightValue
= currentValue
+ oneValue
;
1603 else if ( computeType
== 2)
1606 QString oneTryTrigram
;
1607 QString oneTryTrigrambase
;
1610 oneTryTrigram
= currentrightTrigrambase
+ separator
+ onePossibleExpansion
;
1611 oneTryTrigrambase
= currentrightTrigrambase
;
1614 if (! m_trigramprob
.contains(oneTryTrigram
))
1616 if (m_trigramsbase
.contains(oneTryTrigrambase
))
1618 oneValue
= 10.0; // big punishment
1622 oneValue
= 5.0; // mild punishment
1628 oneValue
= m_trigramprob
[oneTryTrigram
];
1631 rightValue
= currentValue
+ oneValue
;
1640 // create a sentenceItem
1641 oneTryBagOfWords
= (*currentBagOfWords
);;
1642 if ( oneTryBagOfWords
[onePossibleExpansion
] > 1)
1644 oneTryBagOfWords
[onePossibleExpansion
]--;
1648 oneTryBagOfWords
.remove(onePossibleExpansion
);
1651 oneSentenceItem
= new sentenceItem(rightValue
, rightExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
1653 tempResultKSentence
.append(oneSentenceItem
);
1661 tempResultKSentence
.sort();
1664 // Delete the duplicate Items
1668 preString
= QString("");
1669 swapResultKSentence
.clear();
1670 for ( oneCurrentItem
=tempResultKSentence
.first(); oneCurrentItem
!= 0; oneCurrentItem
=tempResultKSentence
.next())
1672 postString
= oneCurrentItem
->m_key
;
1674 if (! (postString
== preString
) )
1676 swapResultKSentence
.append(oneCurrentItem
);
1677 preString
= postString
;
1681 delete oneCurrentItem
;
1687 tempResultKSentence
= swapResultKSentence
;
1690 // Keep the top K item in this sentence list
1691 if ( static_cast <int> ( tempResultKSentence
.count() ) > K
)
1695 diff
= tempResultKSentence
.count() - K
;
1697 tempResultKSentence
.setAutoDelete( TRUE
);
1698 for ( j
= 0; j
< diff
; j
++)
1700 tempResultKSentence
.removeLast();
1702 tempResultKSentence
.setAutoDelete( FALSE
);
1705 // copy to ResultKSentence
1706 resultKSentences
.clear();
1707 resultKSentences
= tempResultKSentence
;
1717 void CSequencer::sequenize2(
1718 QMap
<QString
, int> allBagOfWords
,
1721 Q3SortedList
<sentenceItem
>& resultKSentences
,
1724 // computeType: 1 --> bigram; 2 --> trigram
1726 QMap
<QString
, int> testBagOfWords
;
1727 QMap
<QString
, int> testBagOfWords2
;
1728 QMap
<QString
, int>::Iterator StringToIntIt
;
1729 double currentValue
, tryValue
;
1730 QString currentString
, tryString
;
1731 QString currentleftBigrambase
, currentrightBigrambase
;
1732 QString currentleftTrigrambase
, currentrightTrigrambase
;
1733 QString tryleftBigrambase
, tryrightBigrambase
;
1734 QString tryleftTrigrambase
, tryrightTrigrambase
;
1735 QMap
<QString
, int>* currentBagOfWords
;
1736 QMap
<QString
, int>* tryBagOfWords
;
1738 int leftFirstLoc
, rightFirstLoc
;
1739 int leftSecondLoc
, rightSecondLoc
;
1740 int lenOfSeparator
= separator
.length();
1741 QMap
<QString
, int> oneTryBagOfWords
;
1742 Q3SortedList
<sentenceItem
> tempResultKSentence
;
1743 Q3SortedList
<sentenceItem
> swapResultKSentence
;
1744 Q3SortedList
<sentenceItem
> copyResultKSentence
;
1745 sentenceItem
* oneSentenceItem
;
1746 sentenceItem
* oneCurrentItem
;
1747 sentenceItem
* twoCurrentItem
;
1749 bool canExpandLeft
, canExpandRight
;
1750 bool canExpandLeft2
, canExpandRight2
;
1753 QMap
<int, QString
> currentHistoryString
, tryHistoryString
;
1754 QMap
<int, double> currentHistoryScore
, tryHistoryScore
;
1755 int currentStepNumber
;
1756 int currentNumberOfWordsInSentence
, tryNumberOfWordsInSentence
;
1759 bool deleteduplicatesentence
= false;
1762 resultKSentences
.setAutoDelete( TRUE
);
1763 tempResultKSentence
.setAutoDelete( FALSE
);
1764 copyResultKSentence
.setAutoDelete( FALSE
);
1765 swapResultKSentence
.setAutoDelete( FALSE
);
1770 QFile
file( "SequencerLog2Debug.txt" );
1772 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1774 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1778 Q3TextStream
outf( &file
);
1779 QString displayTempString
;
1782 outf
<< "******One Sentence******" << endl
<<endl
;
1788 // At most Loop for lenOfSentence( abc doesn't count the beginning "#" and ending ".")
1789 // Each loop refers to expand one step
1791 for ( i
=0; i
<lenOfSentence
; i
++)
1794 shouldFurther
= false;
1796 // for each expansion. Total : N loops
1798 if ( i
== lenOfSentence
-1)
1810 QFile
file( "SequencerLog2Debug.txt" );
1812 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1814 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1818 Q3TextStream
outf( &file
);
1819 QString displayTempString
;
1823 outf
<< endl
<< "******" << i
<<"******" << endl
;
1829 tempResultKSentence
.clear();
1831 copyResultKSentence
= resultKSentences
;
1833 for ( oneCurrentItem
=resultKSentences
.first(); oneCurrentItem
!= 0; oneCurrentItem
=resultKSentences
.next())
1835 // for each current string. Total: K loops
1837 currentString
= oneCurrentItem
->m_key
;
1838 currentValue
= oneCurrentItem
->m_value
; // Now, this is an average value;
1839 currentBagOfWords
= &(oneCurrentItem
->m_bagofwords
);
1840 currentHistoryString
.clear();
1841 currentHistoryString
= oneCurrentItem
->m_historystrings
;
1842 currentHistoryScore
.clear();
1843 currentHistoryScore
= oneCurrentItem
->m_historyscores
;
1844 currentStepNumber
= oneCurrentItem
->m_stepnumber
;
1845 currentNumberOfWordsInSentence
= oneCurrentItem
->m_numberofwordsinsentence
;
1852 QFile
file( "SequencerLog2Debug.txt" );
1854 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1856 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1860 Q3TextStream
outf( &file
);
1861 QString displayTempString
;
1864 displayTempString
= currentString
;
1865 displayTempString
= displayTempString
.replace(separator
, " ");
1866 outf
<< displayTempString
<< " " << currentValue
<<endl
;
1872 // Get the words of currentString
1873 testBagOfWords
.clear();
1874 testBagOfWords
= allBagOfWords
;
1875 for ( StringToIntIt
= currentBagOfWords
->begin(); StringToIntIt
!= currentBagOfWords
->end(); StringToIntIt
++)
1877 QString oneInCurrentWords
;
1879 oneInCurrentWords
= StringToIntIt
.key();
1881 if ( testBagOfWords
[oneInCurrentWords
] == 1)
1883 testBagOfWords
.remove(oneInCurrentWords
);
1887 testBagOfWords
[oneInCurrentWords
]--;
1893 // this sentence already done
1894 if ( currentNumberOfWordsInSentence
== (lenOfSentence
+2))
1896 oneSentenceItem
= new sentenceItem(oneCurrentItem
);
1898 tempResultKSentence
.append(oneSentenceItem
);
1903 myLastWord
= lastWord
;
1904 if ( currentNumberOfWordsInSentence
== (lenOfSentence
+1))
1911 // Figure out the bigrambase and trigrambase of the current string;
1912 leftFirstLoc
= currentString
.find(separator
);
1913 currentleftBigrambase
= currentString
.left(leftFirstLoc
);
1914 tempString
= currentString
.right(currentString
.length() - leftFirstLoc
- lenOfSeparator
);
1915 leftSecondLoc
= tempString
.find(separator
);
1917 if ( leftSecondLoc
== -1)
1919 currentleftTrigrambase
= currentString
;
1923 currentleftTrigrambase
= currentString
.left(leftFirstLoc
+ lenOfSeparator
+ leftSecondLoc
);
1927 rightFirstLoc
= currentString
.findRev(separator
);
1928 currentrightBigrambase
= currentString
.right(currentString
.length() - rightFirstLoc
- lenOfSeparator
);
1929 tempString
= currentString
.left(rightFirstLoc
);
1930 rightSecondLoc
= tempString
.findRev(separator
);
1932 if ( rightSecondLoc
== -1)
1934 currentrightTrigrambase
= currentString
;
1938 currentrightTrigrambase
= currentString
.right(currentString
.length() - rightSecondLoc
- lenOfSeparator
);
1942 if (currentleftBigrambase
== QString("#"))
1944 canExpandLeft
= false;
1948 canExpandLeft
= true;
1952 if (currentrightBigrambase
== QString("."))
1954 canExpandRight
= false;
1958 canExpandRight
= true;
1962 if ( (!canExpandLeft
) && (!canExpandRight
))
1969 // Consider the single word in currentbagsofWords
1970 for ( StringToIntIt
= currentBagOfWords
->begin(); StringToIntIt
!= currentBagOfWords
->end(); StringToIntIt
++)
1972 // For each possible word. Total : (N - M) loops
1974 QString onePossibleExpansion
;
1975 QString leftExpansion
;
1976 QString rightExpansion
;
1980 onePossibleExpansion
= StringToIntIt
.key();
1983 // Try Left Expansion
1984 if (( onePossibleExpansion
!= QString(".")) && (canExpandLeft
))
1986 if ((onePossibleExpansion
== QString("#")) && (!canExpandRight
) && (!myLastWord
))
1992 leftExpansion
= onePossibleExpansion
+ separator
+ currentString
;
1994 if ( computeType
== 1)
1996 QString oneTryBigram
;
1999 oneTryBigram
= onePossibleExpansion
+ separator
+ currentleftBigrambase
;
2001 if (! m_bigramprob
.contains(oneTryBigram
))
2003 if (m_bigramsbase
.contains(onePossibleExpansion
))
2005 oneValue
= 10.0; // big punishment
2009 oneValue
= 5.0; // mild punishment
2015 oneValue
= m_bigramprob
[oneTryBigram
];
2018 leftValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
;
2021 else if ( computeType
== 2)
2024 QString oneTryTrigram
;
2025 QString oneTryTrigrambase
;
2028 oneTryTrigrambase
= onePossibleExpansion
+ separator
+ currentleftBigrambase
;
2029 oneTryTrigram
= onePossibleExpansion
+ separator
+ currentleftTrigrambase
;
2033 if (! m_trigramprob
.contains(oneTryTrigram
))
2035 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2037 oneValue
= 10.0; // big punishment
2041 oneValue
= 5.0; // mild punishment
2047 oneValue
= m_trigramprob
[oneTryTrigram
];
2050 leftValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
;
2059 // create a sentenceItem
2060 oneTryBagOfWords
= (*currentBagOfWords
);
2061 if ( oneTryBagOfWords
[onePossibleExpansion
] > 1)
2063 oneTryBagOfWords
[onePossibleExpansion
]--;
2067 oneTryBagOfWords
.remove(onePossibleExpansion
);
2070 oneSentenceItem
= new sentenceItem(leftValue
, leftExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
2071 oneSentenceItem
->m_numberofwordsinsentence
= currentNumberOfWordsInSentence
+ 1;
2072 oneSentenceItem
->m_value
= oneSentenceItem
->m_value
/ (double)oneSentenceItem
->m_numberofwordsinsentence
;
2074 tempResultKSentence
.append(oneSentenceItem
);
2075 shouldFurther
= true;
2080 // Try Right Expansion
2081 if ( (onePossibleExpansion
!= QString("#")) && (canExpandRight
))
2083 if ((onePossibleExpansion
== QString(".")) && (!canExpandLeft
) && (!myLastWord
))
2088 rightExpansion
= currentString
+ separator
+ onePossibleExpansion
;
2090 if ( computeType
== 1)
2092 QString oneTryBigram
;
2095 oneTryBigram
= currentrightBigrambase
+ separator
+ onePossibleExpansion
;
2097 if (! m_bigramprob
.contains(oneTryBigram
))
2099 if (m_bigramsbase
.contains(currentrightBigrambase
))
2101 oneValue
= 10.0; // big punishment
2105 oneValue
= 5.0; // mild punishment
2111 oneValue
= m_bigramprob
[oneTryBigram
];
2114 rightValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
;
2117 else if ( computeType
== 2)
2120 QString oneTryTrigram
;
2121 QString oneTryTrigrambase
;
2124 oneTryTrigram
= currentrightTrigrambase
+ separator
+ onePossibleExpansion
;
2125 oneTryTrigrambase
= currentrightTrigrambase
;
2128 if (! m_trigramprob
.contains(oneTryTrigram
))
2130 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2132 oneValue
= 10.0; // big punishment
2136 oneValue
= 5.0; // mild punishment
2142 oneValue
= m_trigramprob
[oneTryTrigram
];
2145 rightValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
;
2154 // create a sentenceItem
2155 oneTryBagOfWords
= (*currentBagOfWords
);;
2156 if ( oneTryBagOfWords
[onePossibleExpansion
] > 1)
2158 oneTryBagOfWords
[onePossibleExpansion
]--;
2162 oneTryBagOfWords
.remove(onePossibleExpansion
);
2165 oneSentenceItem
= new sentenceItem(rightValue
, rightExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
2166 oneSentenceItem
->m_numberofwordsinsentence
= currentNumberOfWordsInSentence
+ 1;
2167 oneSentenceItem
->m_value
= oneSentenceItem
->m_value
/ (double)oneSentenceItem
->m_numberofwordsinsentence
;
2169 tempResultKSentence
.append(oneSentenceItem
);
2170 shouldFurther
= true;
2178 // If this sentence item needs only one more word, it is not necessary to consider other chunks.
2179 //myLastWord = true;
2186 // Consider the possible chunk concatenations.
2188 for ( twoCurrentItem
=copyResultKSentence
.first(); twoCurrentItem
!= 0; twoCurrentItem
=copyResultKSentence
.next())
2190 QString leftExpansion
;
2191 QString rightExpansion
;
2197 tryString
= twoCurrentItem
->m_key
;
2198 tryValue
= twoCurrentItem
->m_value
; // Now, this is an average value;
2199 tryBagOfWords
= &(twoCurrentItem
->m_bagofwords
);
2200 tryHistoryString
.clear();
2201 tryHistoryString
= twoCurrentItem
->m_historystrings
;
2202 tryHistoryScore
.clear();
2203 tryHistoryScore
= twoCurrentItem
->m_historyscores
;
2204 tryNumberOfWordsInSentence
= twoCurrentItem
->m_numberofwordsinsentence
;
2208 if ( (currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
-2) > lenOfSentence
)
2214 // Figure out the bigrambase and trigrambase of the try string;
2215 leftFirstLoc
= tryString
.find(separator
);
2216 tryleftBigrambase
= tryString
.left(leftFirstLoc
);
2217 tempString
= tryString
.right(tryString
.length() - leftFirstLoc
- lenOfSeparator
);
2218 leftSecondLoc
= tempString
.find(separator
);
2220 if ( leftSecondLoc
== -1)
2222 tryleftTrigrambase
= tryString
;
2226 tryleftTrigrambase
= tryString
.left(leftFirstLoc
+ lenOfSeparator
+ leftSecondLoc
);
2230 rightFirstLoc
= tryString
.findRev(separator
);
2231 tryrightBigrambase
= tryString
.right(tryString
.length() - rightFirstLoc
- lenOfSeparator
);
2232 tempString
= tryString
.left(rightFirstLoc
);
2233 rightSecondLoc
= tempString
.findRev(separator
);
2235 if ( rightSecondLoc
== -1)
2237 tryrightTrigrambase
= tryString
;
2241 tryrightTrigrambase
= tryString
.right(tryString
.length() - rightSecondLoc
- lenOfSeparator
);
2245 // Do quick check in order to save time
2247 if (tryleftBigrambase
== QString("#"))
2249 canExpandLeft2
= false;
2253 canExpandLeft2
= true;
2257 if (tryrightBigrambase
== QString("."))
2259 canExpandRight2
= false;
2263 canExpandRight2
= true;
2268 if ( !canExpandLeft
&& !canExpandRight
)
2273 if ( !canExpandLeft2
&& !canExpandRight2
)
2278 if ( (canExpandLeft
&& !canExpandRight
) && (!canExpandRight2
&& canExpandLeft2
))
2283 if ( (!canExpandLeft
&& canExpandRight
) && (canExpandRight2
&& !canExpandLeft2
))
2288 if ( (canExpandLeft
&& canExpandRight
) && (canExpandRight2
&& canExpandLeft2
))
2290 if (( currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
) > lenOfSentence
)
2297 if ( (canExpandLeft
&& !canExpandRight
) && (canExpandRight2
&& !canExpandLeft2
))
2299 if ( (currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
-2) < lenOfSentence
)
2305 if ( (canExpandLeft2
&& !canExpandRight2
) && (canExpandRight
&& !canExpandLeft
))
2307 if ( (currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
-2) < lenOfSentence
)
2314 // Check whether the two strings overlap in some common words.
2315 testBagOfWords2
.clear();
2316 testBagOfWords2
= (*tryBagOfWords
);
2319 for ( StringToIntIt
= testBagOfWords
.begin(); StringToIntIt
!= testBagOfWords
.end(); StringToIntIt
++)
2321 QString oneInTestWords
;
2322 int oneInTestWordCount
;
2325 oneInTestWords
= StringToIntIt
.key();
2326 oneInTestWordCount
= StringToIntIt
.data();
2328 if (! testBagOfWords2
.contains(oneInTestWords
))
2334 if ( testBagOfWords2
[oneInTestWords
] < oneInTestWordCount
)
2339 else if ( testBagOfWords2
[oneInTestWords
] == oneInTestWordCount
)
2341 testBagOfWords2
.remove(oneInTestWords
);
2345 testBagOfWords2
[oneInTestWords
] -= oneInTestWordCount
;
2350 if ( overlapped
) continue;
2353 // Now, these two chunks are ready to merge
2354 oneTryBagOfWords
.clear();
2355 oneTryBagOfWords
= testBagOfWords2
;
2360 if ( canExpandLeft
&& canExpandRight2
)
2363 leftExpansion
= tryString
+ separator
+ currentString
;
2365 if ( computeType
== 1)
2367 QString oneTryBigram
;
2370 oneTryBigram
= tryrightBigrambase
+ separator
+ currentleftBigrambase
;
2372 if (! m_bigramprob
.contains(oneTryBigram
))
2374 if (m_bigramsbase
.contains(tryrightBigrambase
))
2376 oneValue
= 10.0; // big punishment
2380 oneValue
= 5.0; // mild punishment
2386 oneValue
= m_bigramprob
[oneTryBigram
];
2389 leftValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
+ tryValue
*tryNumberOfWordsInSentence
;
2392 else if ( computeType
== 2)
2395 QString oneTryTrigram
;
2396 QString oneTryTrigrambase
;
2397 double oneValue
, twoValue
;
2400 oneTryTrigrambase
= tryrightBigrambase
+ separator
+ currentleftBigrambase
;
2401 oneTryTrigram
= tryrightBigrambase
+ separator
+ currentleftTrigrambase
;
2405 if (! m_trigramprob
.contains(oneTryTrigram
))
2407 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2409 oneValue
= 10.0; // big punishment
2413 oneValue
= 5.0; // mild punishment
2419 oneValue
= m_trigramprob
[oneTryTrigram
];
2423 // Special case here: one more trigram is taken into account.
2424 oneTryTrigrambase
= tryrightTrigrambase
;
2425 oneTryTrigram
= tryrightTrigrambase
+ separator
+ currentleftBigrambase
;
2428 if (! m_trigramprob
.contains(oneTryTrigram
))
2430 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2432 twoValue
= 10.0; // big punishment
2436 twoValue
= 5.0; // mild punishment
2442 twoValue
= m_trigramprob
[oneTryTrigram
];
2446 leftValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
+ twoValue
+ tryValue
*tryNumberOfWordsInSentence
;
2454 // create a sentenceItem
2457 for ( StringToIntIt
= oneTryBagOfWords
.begin(); StringToIntIt
!= oneTryBagOfWords
.end(); StringToIntIt
++)
2459 QString oneInTestWords
;
2460 int oneInTestWordCount
;
2462 oneInTestWords
= StringToIntIt
.key();
2463 oneInTestWordCount
= 1;
2470 oneSentenceItem
= new sentenceItem(leftValue
, leftExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
2471 oneSentenceItem
->m_numberofwordsinsentence
= currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
;
2472 oneSentenceItem
->m_value
= oneSentenceItem
->m_value
/ (double)oneSentenceItem
->m_numberofwordsinsentence
;
2474 tempResultKSentence
.append(oneSentenceItem
);
2475 shouldFurther
= true;
2482 if ( canExpandRight
&& canExpandLeft2
)
2485 rightExpansion
= currentString
+ separator
+ tryString
;
2487 if ( computeType
== 1)
2489 QString oneTryBigram
;
2492 oneTryBigram
= currentrightBigrambase
+ separator
+ tryleftBigrambase
;
2494 if (! m_bigramprob
.contains(oneTryBigram
))
2496 if (m_bigramsbase
.contains(tryrightBigrambase
))
2498 oneValue
= 10.0; // big punishment
2502 oneValue
= 5.0; // mild punishment
2508 oneValue
= m_bigramprob
[oneTryBigram
];
2511 rightValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
+ tryValue
*tryNumberOfWordsInSentence
;
2514 else if ( computeType
== 2)
2517 QString oneTryTrigram
;
2518 QString oneTryTrigrambase
;
2519 double oneValue
, twoValue
;
2522 oneTryTrigrambase
= currentrightBigrambase
+ separator
+ tryleftBigrambase
;
2523 oneTryTrigram
= currentrightBigrambase
+ separator
+ tryleftTrigrambase
;
2526 if (! m_trigramprob
.contains(oneTryTrigram
))
2528 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2530 oneValue
= 10.0; // big punishment
2534 oneValue
= 5.0; // mild punishment
2540 oneValue
= m_trigramprob
[oneTryTrigram
];
2544 // Special case here: one more trigram is taken into account.
2545 oneTryTrigrambase
= currentrightTrigrambase
;
2546 oneTryTrigram
= currentrightTrigrambase
+ separator
+ tryleftBigrambase
;
2549 if (! m_trigramprob
.contains(oneTryTrigram
))
2551 if (m_trigramsbase
.contains(oneTryTrigrambase
))
2553 twoValue
= 10.0; // big punishment
2557 twoValue
= 5.0; // mild punishment
2563 twoValue
= m_trigramprob
[oneTryTrigram
];
2567 rightValue
= currentValue
*currentNumberOfWordsInSentence
+ oneValue
+ twoValue
+ tryValue
*tryNumberOfWordsInSentence
;
2575 // create a sentenceItem
2576 oneSentenceItem
= new sentenceItem(rightValue
, rightExpansion
, oneTryBagOfWords
, currentStepNumber
+1, currentHistoryString
, currentHistoryScore
);
2577 oneSentenceItem
->m_numberofwordsinsentence
= currentNumberOfWordsInSentence
+ tryNumberOfWordsInSentence
;
2578 oneSentenceItem
->m_value
= oneSentenceItem
->m_value
/ (double)oneSentenceItem
->m_numberofwordsinsentence
;
2580 tempResultKSentence
.append(oneSentenceItem
);
2581 shouldFurther
= true;
2592 tempResultKSentence
.sort();
2594 // Delete the duplicate Items
2596 if ( deleteduplicatesentence
)
2601 preString
= QString("");
2602 swapResultKSentence
.clear();
2603 for ( oneCurrentItem
=tempResultKSentence
.first(); oneCurrentItem
!= 0; oneCurrentItem
=tempResultKSentence
.next())
2605 postString
= oneCurrentItem
->m_key
;
2607 if (! (postString
== preString
) )
2609 swapResultKSentence
.append(oneCurrentItem
);
2610 preString
= postString
;
2614 delete oneCurrentItem
;
2620 tempResultKSentence
= swapResultKSentence
;
2624 // Keep the top K item in this sentence list
2625 if ( static_cast <int> ( tempResultKSentence
.count() ) > K
)
2629 diff
= tempResultKSentence
.count() - K
;
2631 tempResultKSentence
.setAutoDelete( TRUE
);
2632 for ( j
= 0; j
< diff
; j
++)
2634 tempResultKSentence
.removeLast();
2636 tempResultKSentence
.setAutoDelete( FALSE
);
2639 // copy to ResultKSentence
2640 resultKSentences
.clear();
2641 resultKSentences
= tempResultKSentence
;
2643 if ( !shouldFurther
)