Fixed leaks in LM::operator[](const char *) and LM::clear_oov()
[vspell.git] / libvspell / spell.h
blob1509d3b5c166e3e80a46413f5c018a965df558e8
1 #ifndef __SPELL_H__ // -*- tab-width: 2 mode: c++ -*-
2 #define __SPELL_H__
4 #ifdef __TYPES_H__
5 #include "types.h"
6 #endif
8 #ifndef __DICTIONARY_H__
9 #include "dictionary.h"
10 #endif
12 #ifndef __WORDNODE_H__
13 #include "wordnode.h"
14 #endif
16 #ifndef __VECTOR__
17 #include <vector>
18 #endif
19 #ifndef __SET__
20 #include <set>
21 #endif
22 #ifndef __STRING__
23 #include <string>
24 #endif
25 #ifndef __IOSTREAM__
26 #include <iostream>
27 #endif
28 #ifndef __SSTREAM__
29 #include <sstream>
30 #endif
32 #ifndef BOOST_SHARED_PTR_HPP_INCLUDED
33 #include <boost/shared_ptr.hpp>
34 #endif
36 class Sentence;
38 /**
39 Store information of a word in a sentence.
42 struct WordEntry {
43 unsigned int pos; /// syllable index
44 unsigned int len; /// word len
45 unsigned int fuzid; /// to identify different WordEntry with the same pos/len.
46 /// fuzid is a mask of fuzzy/exact match.
47 unsigned int id; /// index in WordEntries
49 DNNode<LeafNNode> node; /// Word content
51 bool operator < (const WordEntry &we) const {
52 return pos != we.pos ? pos < we.pos :
53 (len != we.len ? len < we.len :
54 (fuzid != we.fuzid ? fuzid < we.fuzid : node.node < we.node.node));
58 std::ostream& operator << (std::ostream &os,const WordEntry &we);
59 std::istream& operator >> (std::istream &is,WordEntry &we);
61 typedef WordEntry* WordEntryRef;
62 typedef std::vector<WordEntry> WordEntries;
63 typedef std::vector<WordEntryRef> WordEntryRefs;
66 /**
67 Store WordInfo(s) which have a specified length
70 class WordInfos {
71 public:
72 WordInfos();
73 unsigned int exact_len;
74 WordEntryRefs fuzzy_map; /// contains all WordEntry which are fuzzy at this position
75 WordEntryRefs we; /// contains all WordEntry which started at this pos
78 class WordState;
79 typedef std::vector<WordState*> WordStates;
81 /**
82 Store information used by WFST::get_all_words().
83 This is a self-destroy object. get_next(), get_first() will destroy itself if necessary.
85 class WordState {
86 protected:
87 WordState(const WordState&);
88 void add_word(std::set<WordEntry> &we,LeafNNode*);
90 /**
91 the currently processing node
93 DNNode<BranchNNode> dnode;
94 const Sentence &sent;
95 //bool survive;
97 int fuzid;
98 int pos;
99 int len;
101 public:
102 WordState(const Sentence &st):fuzid(0),sent(st),pos(0) {}
103 virtual void get_first(WordStates &states,uint pos);
104 virtual void get_next(WordStates &states) = 0; // you have to delete your self after this if your task is done
105 virtual void collect_words(std::set<WordEntry> &we);
109 struct WordStateFactory {
110 virtual void create_new(WordStates &states,uint pos,const Sentence &st) const = 0;
113 typedef std::vector<WordStateFactory*> WordStateFactories;
/**
   Define struct <CLASS>Factory, a WordStateFactory whose create_new()
   heap-allocates a CLASS bound to the sentence and immediately calls
   get_first(states,pos) on it. The new object is not stored or deleted here;
   WordState is documented as a self-destroy object.

   Invoke as  WORDSTATEFACTORY(Foo);  -- the invocations in this header
   supply the terminating semicolon.
 */
#define WORDSTATEFACTORY(CLASS)                                             \
  struct CLASS##Factory : public WordStateFactory {                         \
    virtual void create_new(WordStates &states, uint pos,                   \
                            const Sentence &st) const {                     \
      (new CLASS(st))->get_first(states, pos);                              \
    }                                                                       \
  }
123 Store WordInfos(s) started at specified positions.
126 class Lattice {
127 protected:
128 std::vector<WordInfos> wi;
130 public:
132 boost::shared_ptr<WordEntries> we;
133 boost::shared_ptr<const Sentence> st;
135 void construct(const boost::shared_ptr<const Sentence> &st);
136 void pre_construct(const boost::shared_ptr<const Sentence> &st,std::set<WordEntry> &wes,const WordStateFactories &f);
137 void post_construct(std::set<WordEntry> &wes);
139 /// Get the number of available positions, from 0 to n-1
140 unsigned int get_word_count() const {
141 return wi.size();
145 Get maximal length of words at specified position.
146 \param i specify a position in sentence
148 unsigned int get_len(unsigned int i) const;
151 Get the length of the exact words at specified position.
152 \param i specify a position in sentence
154 unsigned int get_exact_len(unsigned int i) const {
155 return wi[i].exact_len;
159 Get fuzzy map at specified position.
160 \param i specify a position
163 const WordEntryRefs& get_fuzzy_map(unsigned int i) const {
164 return wi[i].fuzzy_map;
168 Get the all WordEntry(s) at specified pos.
169 \param pos specify a position.
172 const WordEntryRefs& get_we(unsigned int pos) const {
173 return wi[pos].we;
176 ~Lattice(); // WARN: destroy all.
179 Construct Lattice based on member we.
180 we must be valid.
183 void construct();
186 Construct Lattice based on another Lattice.
187 Only exact matches are copied.
188 \param w specify the "template" Lattice
191 void based_on(const Lattice &w);
194 Add WordEntry w into Lattice.
197 void add(WordEntry &w);
199 friend std::ostream& operator << (std::ostream& os,const Lattice &w);
200 friend std::istream& operator >> (std::istream& is,Lattice &w);
203 struct ExactWordState:public WordState {
204 ExactWordState(const Sentence &st):WordState(st) {}
205 void get_next(WordStates &states);
207 WORDSTATEFACTORY(ExactWordState);
209 struct LowerWordState:public WordState {
210 LowerWordState(const Sentence &st):WordState(st) {}
211 void get_next(WordStates &states);
213 WORDSTATEFACTORY(LowerWordState);
215 struct UpperWordState:public LowerWordState {
216 UpperWordState(const Sentence &st):LowerWordState(st) {}
217 void collect_words(std::set<WordEntry> &we);
219 WORDSTATEFACTORY(UpperWordState);
221 struct FuzzyWordState:public WordState {
222 FuzzyWordState(const Sentence &st):WordState(st) {}
223 void get_next(WordStates &states);
225 WORDSTATEFACTORY(FuzzyWordState);
227 struct CaseWordState:public WordState {
228 CaseWordState(const Sentence &st):WordState(st) {}
229 void get_next(WordStates &states);
231 WORDSTATEFACTORY(CaseWordState);
235 Sentence is used to store a sequence of syllables.
236 Sentence and Lattice will keep all necessary info for a spelling checker.
237 "Sentence" here is not exactly a sentence. It's just a part of sentence
238 separated by punctuation.
241 class Sentence
243 public:
244 class Syllable
246 private:
247 public:
248 unsigned int start;
249 strid id; /// real string
250 strid cid; /// lowercased string
251 //strid iid,icid;
252 //std::string::iterator start,end;
253 Sentence *sent_;
254 unsigned int category;
255 unsigned int span;
257 strid get_id() const { return id; }
258 strid get_cid() const { return cid; }
261 private:
262 std::string sent_;
263 std::vector<Syllable> syllables;
264 friend class Syllable;
266 void tokenize_punctuation(const std::string &s,std::vector<std::string> &ss);
268 public:
269 Sentence() {}
270 Sentence(const Lattice& l);
271 Sentence(std::istream &is);
272 Sentence(const std::string &st):sent_(st) {}
273 void set(const std::string &st) { sent_ = st; syllables.clear(); }
274 std::string get() const { return sent_; }
275 void tokenize();
276 void standardize();
277 unsigned int get_syllable_count() const { return syllables.size(); }
278 // void get_word_number() { return word.size(); }
279 Syllable& operator[] (unsigned int i) { return syllables[i]; }
280 Syllable operator[] (unsigned int i) const { return syllables[i]; }
281 // Syllable& operator[] (int i) { return syllables[i]; }
282 bool is_contiguous(unsigned int i); // i & i+1 is contiguous ?
283 void merge(unsigned int i);
284 friend std::ostream& operator <<(std::ostream &os, const Sentence &st);
287 typedef Sentence* SentenceRef;
290 Segmentation store a sequence of WordEntry index.
291 From WordEntry we can get the real word.
294 struct Segmentation : public std::vector<uint>
296 boost::shared_ptr<WordEntries> we; /// WordEntries associated with Segmentation
297 float prob; /// total prob
298 int distance; /// total distance
300 Segmentation(boost::shared_ptr<WordEntries> _we = boost::shared_ptr<WordEntries>()):
301 we(_we),
302 prob(0),
303 distance(0) {}
305 Segmentation(const Segmentation&seg):std::vector<uint>(seg) {
306 prob = seg.prob;
307 distance = seg.distance;
308 we = seg.we;
311 const WordEntry& operator[] (int id) const {
312 return (*we)[std::vector<uint>::operator[](id)];
314 friend std::ostream& operator <<(std::ostream &os,const Segmentation &seg);
315 std::ostream& pretty_print(std::ostream &os,const Sentence &st);
319 typedef std::vector<Segmentation> Segmentations;
321 struct Suggestion {
322 int id;
323 std::vector<strid> suggestions;
326 typedef std::vector<Suggestion> Suggestions;
328 void apply_separator(std::set<WordEntry> &wes,int p);
330 //namespace Spell {
331 //void spell_check1(Sentence &st,Suggestions &s);
332 //void spell_check2(Sentence &st,Segmentation &seg,Suggestions &s);
336 #endif