last changes
[vspell.git] / libvspell / wfst.cpp
blob75c0a4121ed9451fcc7c55b8aafaa0a5097c4cce
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "wfst.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
10 #include "posgen.h"
11 #include <signal.h>
12 #include <unistd.h>
14 using namespace std;
// Segmentation2: a list of Segmentations collections; not referenced anywhere
// in the visible part of this file — TODO confirm it is used elsewhere.
18 typedef vector<Segmentations> Segmentation2;
// NOTE(review): this chunk is a gitweb/HTML scrape of wfst.cpp — every line
// carries its original source line number as a prefix, and a number of
// original lines (opening/closing braces among them) were dropped by the
// scrape. The code is therefore an incomplete rendering; it is kept
// byte-identical below and only comments are added.
20 /**
21 Generate new Lattice info based on misspelled position info.
22 Call WFST::get_sections() to split into sections
23 then call WFST::segment_all1() to do real segmentation.
24 \param pos contains possibly misspelled position.
25 \param len the actual length of pos. Don't use pos.size()
// \param final_seg receives the best segmentation found (cleared before use).
28 void WFST::generate_misspelled_words(const vector<uint> &pos,int len,Segmentation &final_seg)
// (the function's opening brace — original line 29 — was lost in the scrape)
30 const Lattice &words = *p_words;
31 Lattice w;
33 w.based_on(words);
35 // 2. Compute the score, jump to 1. if score is too low (pruning 1)
37 // create new (more compact) Lattice structure
// For each possibly-misspelled position, add all of its fuzzy-match
// candidates to the new lattice w.
38 int i,n = words.get_word_count();
39 for (i = 0;i < len;i ++) {
40 const WordEntryRefs &fmap = words.get_fuzzy_map(pos[i]);
41 int ii,nn = fmap.size();
42 for (ii = 0;ii < nn;++ii)
43 w.add(*fmap[ii]);
46 //cerr << w << endl;
48 // 4. Create sections
// Split the lattice into independent sections that can each be segmented
// separately (note: constructed from the original `words`, not from `w`).
49 Sections sects;
50 sects.construct(words);
52 // 5. Call create_base_segmentation
53 //Segmentation base_seg(words.we);
54 //create_base_segmentation(words,sects,base_seg);
57 // 6. Get the best segmentation of each section,
58 // then merge to one big segment.
59 n = sects.size();
61 uint ii,nn;
63 i = ii = 0;
64 nn = words.get_word_count();
66 final_seg.clear();
// Walk the lattice left to right: when a section starts at ii, append that
// section's best segmentation and skip over its length; otherwise there is
// only a single word starting at ii (see comment below), so append it.
67 while (ii < nn)
68 if (i < n && sects[i].start == ii) {
69 Segmentation seg;
70 sects[i].segment_best(words,seg);
71 copy(seg.begin(),
72 seg.end(),
73 back_insert_iterator< Segmentation >(final_seg));
74 ii += sects[i].len;
75 i ++;
76 } else {
77 // only word(i,*,0) exists
78 final_seg.push_back(words.get_we(ii)[0]->id);
79 ii += words.get_we(ii)[0]->len;
// (the closing braces of the while-loop and of the function — original
// lines 80-81 — were lost in the scrape)
83 /**
84 Create the best segmentation for a sentence.
85 \param words store Lattice info
86 \param seps return the best segmentation
// Lower prob is better: a candidate replaces seps when seg.prob < seps.prob.
89 void WFST::segment_best(const Lattice &words,Segmentation &seps)
91 //int i,ii,n,nn;
// Remember the lattice so generate_misspelled_words() can reach it via p_words.
93 p_words = &words;
95 // in test mode, generate all positions where misspelled can appear,
96 // then create a new Lattice for them, re get_sections,
97 // create_base_segmentation and call segment_all1 for each sections.
99 // 1. Generate misspelled positions (pruning 0 - GA)
100 // 2. Compute the score, jump to 1. if score is too low (pruning 1)
101 // 3. Make a new Lattice based on the original Lattice
102 // 4. Call get_sections
103 // 5. Call create_base_segmentation
104 // 6. Call segment_all1 for each sections.
105 // 6.1. Recompute the score after each section processed. (pruning 2)
107 // 1. Permutation problem, computing combinations C(k,n) with modification. C(1,n)+C(2,n)+...+C(k,n)
108 Generator gen;
110 gen.init(*words.st);
111 vector<uint> pos;
112 uint len;
// 100 is a sentinel "worse than any real score" value (assumes real prob
// values stay below 100 — TODO confirm) so the first candidate always wins.
113 seps.prob = 100;
// Enumerate every candidate set of misspelled positions and keep the
// lowest-prob segmentation produced for any of them.
114 while (gen.step(pos,len)) {
115 Segmentation seg;
116 //cerr << "POS :";
117 //for (int i = 0;i < len;i ++) cerr << pos[i];
118 //cerr << endl;
119 generate_misspelled_words(pos,len,seg);
120 if (seg.prob < seps.prob)
121 seps = seg;
123 gen.done();
// (the opening of this doc comment — original line 127 "/**" — was lost in
// the scrape)
128 Create the best segmentation for a sentence. The biggest difference between
129 segment_best and segment_best_no_fuzzy is that segment_best_no_fuzzy doesn't
130 use Generator. It assumes there is no misspelled position at all.
131 \param words store Lattice info
132 \param seps return the best segmentation
135 void WFST::segment_best_no_fuzzy(const Lattice &words,Segmentation &seps)
137 p_words = &words;
// Empty position list + len 0: segment without considering any misspelled
// (fuzzy) candidate positions.
139 vector<uint> pos;
140 generate_misspelled_words(pos,0,seps);
143 // WFST (Weighted Finite State Transducer) implementation
144 // TUNE: think about genetic/greedy. Vietnamese is almost one-syllable words..
145 // we find where is likely a start of a word, then try to construct word
146 // and check if others are still valid words.
148 // the last item is always the incompleted item. We will try to complete
149 // a word from the item. If we reach the end of sentence, we will remove it
150 // from segs
152 // obsolete
// Debug/obsolete routine: builds a lattice from the sentence and dumps every
// fuzzy match (edit distance and node probability) to cerr.  `result` is not
// filled anywhere in the visible code.
153 void WFST::segment_all(const Sentence &sent,vector<Segmentation> &result)
155 Lattice words;
156 words.construct(sent);
157 // segment_all1(sent,words,0,sent.get_syllable_count(),result);
// NOTE(review): the declaration of loop variable `i` (original line 158,
// presumably "int i;") was lost in the scrape.
159 int nn = words.size();
160 for (i = 0;i < nn;i ++) {
161 int nnn = words[i].size();
162 cerr << "From " << i << endl;
163 for (int ii = 0;ii < nnn;ii ++) {
164 int nnnn = words[i][ii].fuzzy_match.size();
165 cerr << "Len " << ii << endl;
166 for (int iii = 0;iii < nnnn;iii ++) {
167 cerr << words[i][ii].fuzzy_match[iii].distance << " ";
168 cerr << words[i][ii].fuzzy_match[iii].node->get_prob() << endl;
// Orders segmentations by descending prob; with std::push_heap/pop_heap this
// keeps the lowest-prob (i.e. best, per segment_best's "lower is better"
// comparison) element at the heap top.  Referenced only by the commented-out
// heap code in Segmentor::step in the visible chunk.
177 * Segmentation comparator
180 class SegmentationComparator
182 public:
183 bool operator() (const Segmentation &seg1,const Segmentation &seg2) {
184 return seg1.prob > seg2.prob;
// Prepare the segmentor to enumerate segmentations of the lattice covering
// syllables [from..to]; actual enumeration happens in step().
193 void Segmentor::init(const Lattice &words,
194 int from,
195 int to)
// (the opening brace — original line 196 — was lost in the scrape)
197 nr_syllables = to+1;
199 segs.clear();
200 segs.reserve(100);
// Seed the work stack with a single empty trace starting at `from`.
202 Trace trace(words.we);
203 trace.next_syllable = from;
204 segs.push_back(trace); // empty seg
206 _words = &words;
// Produce the next complete segmentation via depth-first search over the
// work stack `segs` (filled by init()); returns true with `result` set, or
// false once the stack is exhausted.
209 bool Segmentor::step(Segmentation &result)
211 const Lattice &words = *_words;
212 // const Sentence &sent = *_words->st;
213 while (!segs.empty()) {
214 // get one
215 Trace trace = segs.back();
216 segs.pop_back();
// NOTE(review): local `seg` appears unused in the visible code.
218 Segmentation seg = trace.s;
219 int next_syllable = trace.next_syllable;
221 // segmentation completed. throw it away
222 if (next_syllable >= nr_syllables)
223 continue;
225 //WordEntries::iterator i_entry;
// Extend the partial segmentation with every word that can start at
// next_syllable.
226 const WordEntryRefs &wes = words.get_we(next_syllable);
227 int ii,nn = wes.size();
228 for (ii = 0;ii < nn;ii ++) {
229 WordEntryRef i_entry = wes[ii];
231 // New segmentation for longer incomplete word
232 Trace newtrace(words.we);
233 newtrace = trace;
234 newtrace.s.push_back(i_entry->id);
235 newtrace.s.prob += i_entry->node.node->get_prob();
237 /*-it's better to compute ngram outside this function
238 if (ngram_enabled) {
239 if (newtrace.s.size() >= NGRAM_LENGTH) {
240 VocabIndex *vi = new VocabIndex[NGRAM_LENGTH];
241 vi[0] = newtrace.s.items[len-1].node(sent).node->get_id();
242 vi[1] = Vocab_None;
243 newtrace.s.prob += -ngram.wordProb(newtrace.s.items[len-2].node(sent).node->get_id(),vi);
244 delete[] vi;
// If the new trace reaches the end of the range, it is a complete
// segmentation — hand it to the caller; otherwise push it back onto the
// stack for further extension on a later iteration.
249 newtrace.next_syllable += i_entry->len;
250 if (newtrace.next_syllable == nr_syllables) {
251 //cout << count << " " << newtrace.s << endl;
252 result = newtrace.s;
253 return true;
254 //result.push_back(newtrace.s);
255 //push_heap(result.begin(),result.end(),SegmentationComparator());
256 //count ++;
257 } else {
258 segs.push_back(newtrace);
261 } // end while
262 return false;
265 void Segmentor::done()