fixed bugs in Text::penalty2_construct(), Penalty2DAG::set_syllable_weights()
[vspell.git] / libvspell / sections.cpp
blob2f6894d1ee570e86fee6e3d9f047cd79fd949879
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "wfst.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
12 using namespace std;
/**
 * Dump a Sections to an output stream.
 */
19 std::ostream& operator << (std::ostream &os,const Sections &me)
21 using namespace boost;
22 const Sentence &_sent = *me.st;
23 unsigned int i,ii,nn,n = _sent.get_syllable_count();
24 ii = 0;
25 nn = me.size();
26 for (i = 0;i < n;i ++) {
27 if (ii < nn && me[ii].start == i)
28 os << "[";
29 os << format("%s") % get_sarch()[_sent[i].id];
30 if (ii < nn && me[ii].start+me[ii].len-1 == i) {
31 os << "]" << me[ii].len;
32 ii ++;
34 os << " ";
36 return os;
/**
 * Split a Lattice into Sections.
 * \param words input lattice; the resulting sections are stored in *this.
 */
45 void Sections::construct(const Lattice &words)
47 Sections& sects = *this;
49 // mark all possible words. All bounds left is really bounds.
50 // because we need at most n-1 boundary syllable for n-gram
51 // if two ambiguous sections is near than n-1 syllables, then merge them.
52 unsigned int i,ii,n,nn;
54 sects.st = words.st;
56 n = words.get_word_count();
58 vector<uint> bound(n);
60 for (i = 0;i < n;i ++) {
61 nn = words.get_len(i);
62 for (ii = 0;ii < nn-2;ii ++)
63 bound[ii+i] = 1;
65 if (!bound.size())
66 return;
67 bound[bound.size()-1] = 1; // it's obvious there is a boundary in the end
69 //copy(bound.begin(),bound.end(),ostream_iterator<int>(cerr," "));
71 // "tokenize" _sent
72 int pos,len = bound.size();
73 Section sect;
75 sect.start = 0;
76 sect.len = 0;
78 for (pos = 0;pos < len;pos ++) {
79 // ignore "1" boundaries
80 if (bound[pos])
81 continue;
83 bool is_section;
84 // just write down and figure out what the formulas mean
85 sect.len = pos - sect.start + 1;
86 is_section = words.get_len(sect.start) > 2 ||
87 (words.get_len(sect.start) >= 2 &&
88 !words.get_fuzzy_map(sect.start).empty());
90 if (is_section) {
91 // now merge two sections (this and the previous) if needed
92 if (!sects.empty()) {
93 Section &prev = sects.back();
94 if (sect.start - (prev.start + prev.len) < NGRAM_LENGTH-1)
95 prev.len = pos - prev.start + 1; // merge
96 else
97 sects.push_back(sect); // not merge
98 } else
99 sects.push_back(sect); // sects is empty -> nothing to merge
101 sect.start = pos+1;
104 if (sects.empty()) {
105 sect.start=0;
106 sect.len = n-sect.start;
107 sects.push_back(sect);
111 std::ostream& operator <<(std::ostream &os,const Segmentation &seg)
113 int i,n = seg.size();
114 for (i = 0;i < n;i ++)
115 os << "[" << seg[i].node << "] ";
116 return os;
119 void Section::segment_best(const Lattice &w,Segmentation &final_seg)
121 Segmentation seg(w.we);
122 Segmentor segtor;
123 final_seg.prob = 1000000;
125 segtor.init(w, // Lattice
126 start, // from
127 start+len-1); // to
129 VocabIndex *vi = new VocabIndex[NGRAM_LENGTH];
130 while (segtor.step(seg)) {
131 // compute ngram. take the best seg.
132 seg.prob = 0;
133 vi[NGRAM_LENGTH] = Vocab_None;
134 for (unsigned int ii = NGRAM_LENGTH-1;ii < seg.size();ii ++) {
135 for (unsigned int j = 0;j < NGRAM_LENGTH-1;j++)
136 vi[j] = seg[ii-1-j].node.node->get_id();
137 seg.prob += -get_ngram().wordProb(seg[ii].node.node->get_id(),vi);
140 if (seg.prob < final_seg.prob)
141 final_seg = seg;
143 //cerr << seg << " " << seg.prob << endl;
146 delete[] vi;