fixed bugs in Text::penalty2_construct(), Penalty2DAG::set_syllable_weights()
[vspell.git] / libvspell / sections.cpp
blob2f6894d1ee570e86fee6e3d9f047cd79fd949879
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "wfst.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
12 using namespace std;
/**
 * Dump a Sections to an output stream.
 */
19 std::ostream& operator << (std::ostream &os,const Sections &me)
21 using namespace boost;
22 const Sentence &_sent = *me.st;
23 unsigned int i,ii,nn,n = _sent.get_syllable_count();
24 ii = 0;
25 nn = me.size();
26 for (i = 0;i < n;i ++) {
27 if (ii < nn && me[ii].start == i)
28 os << "[";
29 os << format("%s") % get_sarch()[_sent[i].id];
30 if (ii < nn && me[ii].start+me[ii].len-1 == i) {
31 os << "]" << me[ii].len;
32 ii ++;
34 os << " ";
36 return os;
/**
 * Split a Lattice into Sections.
 * \param words input lattice; the resulting sections are stored in *this.
 */
45 void Sections::construct(const Lattice &words)
47 Sections& sects = *this;
49 // mark all possible words. All bounds left is really bounds.
50 // because we need at most n-1 boundary syllable for n-gram
51 // if two ambiguous sections is near than n-1 syllables, then merge them.
52 unsigned int i,ii,n,nn;
54 sects.st = words.st;
56 n = words.get_word_count();
58 vector<uint> bound(n);
60 for (i = 0;i < n;i ++) {
61 nn = words.get_len(i);
62 for (ii = 0;ii < nn-2;ii ++)
63 bound[ii+i] = 1;
65 if (!bound.size())
66 return;
67 bound[bound.size()-1] = 1; // it's obvious there is a boundary in the end
69 //copy(bound.begin(),bound.end(),ostream_iterator<int>(cerr," "));
71 // "tokenize" _sent
72 int pos,len = bound.size();
73 Section sect;
75 sect.start = 0;
76 sect.len = 0;
78 for (pos = 0;pos < len;pos ++) {
79 // ignore "1" boundaries
80 if (bound[pos])
81 continue;
83 bool is_section;
84 // just write down and figure out what the formulas mean
85 sect.len = pos - sect.start + 1;
86 is_section = words.get_len(sect.start) > 2 ||
87 (words.get_len(sect.start) >= 2 &&
88 !words.get_fuzzy_map(sect.start).empty());
90 if (is_section) {
91 // now merge two sections (this and the previous) if needed
92 if (!sects.empty()) {
93 Section &prev = sects.back();
94 if (sect.start - (prev.start + prev.len) < NGRAM_LENGTH-1)
95 prev.len = pos - prev.start + 1; // merge
96 else
97 sects.push_back(sect); // not merge
98 } else
99 sects.push_back(sect); // sects is empty -> nothing to merge
101 sect.start = pos+1;
104 if (sects.empty()) {
105 sect.start=0;
106 sect.len = n-sect.start;
107 sects.push_back(sect);
111 std::ostream& operator <<(std::ostream &os,const Segmentation &seg)
113 int i,n = seg.size();
114 for (i = 0;i < n;i ++)
115 os << "[" << seg[i].node << "] ";
116 return os;
119 void Section::segment_best(const Lattice &w,Segmentation &final_seg)
121 Segmentation seg(w.we);
122 Segmentor segtor;
123 final_seg.prob = 1000000;
125 segtor.init(w, // Lattice
126 start, // from
127 start+len-1); // to
129 VocabIndex *vi = new VocabIndex[NGRAM_LENGTH];
130 while (segtor.step(seg)) {
131 // compute ngram. take the best seg.
132 seg.prob = 0;
133 vi[NGRAM_LENGTH] = Vocab_None;
134 for (unsigned int ii = NGRAM_LENGTH-1;ii < seg.size();ii ++) {
135 for (unsigned int j = 0;j < NGRAM_LENGTH-1;j++)
136 vi[j] = seg[ii-1-j].node.node->get_id();
137 seg.prob += -get_ngram().wordProb(seg[ii].node.node->get_id(),vi);
140 if (seg.prob < final_seg.prob)
141 final_seg = seg;
143 //cerr << seg << " " << seg.prob << endl;
146 delete[] vi;