1 #ifndef __SPELL_H__ // -*- tab-width: 2 mode: c++ -*-
8 #ifndef __DICTIONARY_H__
9 #include "dictionary.h"
12 #ifndef __WORDNODE_H__
32 #ifndef BOOST_SHARED_PTR_HPP_INCLUDED
33 #include <boost/shared_ptr.hpp>
39 Store information of a word in a sentence.
43 unsigned int pos
; /// syllable index
44 unsigned int len
; /// word len
45 unsigned int fuzid
; /// to identify different WordEntry with the same pos/len.
46 /// fuzid is a mask of fuzzy/exact match.
47 unsigned int id
; /// index in WordEntries
49 DNNode
<LeafNNode
> node
; /// Word content
51 bool operator < (const WordEntry
&we
) const {
52 return pos
!= we
.pos
? pos
< we
.pos
:
53 (len
!= we
.len
? len
< we
.len
:
54 (fuzid
!= we
.fuzid
? fuzid
< we
.fuzid
: node
.node
< we
.node
.node
));
58 std::ostream
& operator << (std::ostream
&os
,const WordEntry
&we
);
59 std::istream
& operator >> (std::istream
&is
,WordEntry
&we
);
61 typedef WordEntry
* WordEntryRef
;
62 typedef std::vector
<WordEntry
> WordEntries
;
63 typedef std::vector
<WordEntryRef
> WordEntryRefs
;
67 Store WordInfo(s) which have a specified length
73 unsigned int exact_len
;
74 WordEntryRefs fuzzy_map
; /// contains all WordEntry which are fuzzy at this position
75 WordEntryRefs we
; /// contains all WordEntry which started at this pos
79 typedef std::vector
<WordState
*> WordStates
;
82 Store information used by WFST::get_all_words().
83 This is a self-destroying object: get_next() and get_first() will destroy the object if necessary.
87 WordState(const WordState
&);
88 void add_word(std::set
<WordEntry
> &we
,LeafNNode
*);
91 the node currently being processed
93 DNNode
<BranchNNode
> dnode
;
102 WordState(const Sentence
&st
):fuzid(0),sent(st
),pos(0) {}
103 virtual void get_first(WordStates
&states
,uint pos
);
104 virtual void get_next(WordStates
&states
) = 0; // you have to delete your self after this if your task is done
105 virtual void collect_words(std::set
<WordEntry
> &we
);
109 struct WordStateFactory
{
110 virtual void create_new(WordStates
&states
,uint pos
,const Sentence
&st
) const = 0;
113 typedef std::vector
<WordStateFactory
*> WordStateFactories
;
/**
   Declare CLASS##Factory, a WordStateFactory whose create_new() allocates
   a CLASS bound to the sentence and calls get_first(states,pos) on it.
   The allocated object is not deleted here.
   Use as "WORDSTATEFACTORY(Foo);" - the macro itself carries no trailing
   semicolon and no trailing line continuation.
*/
#define WORDSTATEFACTORY(CLASS)\
struct CLASS##Factory:public WordStateFactory { \
  virtual void create_new(WordStates &states,uint pos,const Sentence &st) const {\
    (new CLASS(st))->get_first(states,pos);\
  }\
}
123 Store the WordInfos that start at the specified positions.
128 std::vector
<WordInfos
> wi
;
132 boost::shared_ptr
<WordEntries
> we
;
133 boost::shared_ptr
<const Sentence
> st
;
135 void construct(const boost::shared_ptr
<const Sentence
> &st
);
136 void pre_construct(const boost::shared_ptr
<const Sentence
> &st
,std::set
<WordEntry
> &wes
,const WordStateFactories
&f
);
137 void post_construct(std::set
<WordEntry
> &wes
);
139 /// Get the number of available positions, from 0 to n-1
140 unsigned int get_word_count() const {
145 Get maximal length of words at specified position.
146 \param i specify a position in sentence
148 unsigned int get_len(unsigned int i
) const;
151 Get the length of the exact words at specified position.
152 \param i specify a position in sentence
154 unsigned int get_exact_len(unsigned int i
) const {
155 return wi
[i
].exact_len
;
159 Get fuzzy map at specified position.
160 \param i specify a position
163 const WordEntryRefs
& get_fuzzy_map(unsigned int i
) const {
164 return wi
[i
].fuzzy_map
;
168 Get all the WordEntry(s) at the specified position.
169 \param pos specify a position.
172 const WordEntryRefs
& get_we(unsigned int pos
) const {
176 ~Lattice(); // WARN: destroy all.
179 Construct Lattice based on member we.
186 Construct Lattice based on another Lattice.
187 Only exact matches are copied.
188 \param w specify the "template" Lattice
191 void based_on(const Lattice
&w
);
194 Add WordEntry w into Lattice.
197 void add(WordEntry
&w
);
199 friend std::ostream
& operator << (std::ostream
& os
,const Lattice
&w
);
200 friend std::istream
& operator >> (std::istream
& is
,Lattice
&w
);
203 struct ExactWordState
:public WordState
{
204 ExactWordState(const Sentence
&st
):WordState(st
) {}
205 void get_next(WordStates
&states
);
207 WORDSTATEFACTORY(ExactWordState
);
209 struct LowerWordState
:public WordState
{
210 LowerWordState(const Sentence
&st
):WordState(st
) {}
211 void get_next(WordStates
&states
);
213 WORDSTATEFACTORY(LowerWordState
);
215 struct UpperWordState
:public LowerWordState
{
216 UpperWordState(const Sentence
&st
):LowerWordState(st
) {}
217 void collect_words(std::set
<WordEntry
> &we
);
219 WORDSTATEFACTORY(UpperWordState
);
221 struct FuzzyWordState
:public WordState
{
222 FuzzyWordState(const Sentence
&st
):WordState(st
) {}
223 void get_next(WordStates
&states
);
225 WORDSTATEFACTORY(FuzzyWordState
);
227 struct CaseWordState
:public WordState
{
228 CaseWordState(const Sentence
&st
):WordState(st
) {}
229 void get_next(WordStates
&states
);
231 WORDSTATEFACTORY(CaseWordState
);
235 Sentence is used to store a sequence of syllables.
236 Sentence and Lattice will keep all necessary info for a spelling checker.
237 "Sentence" here is not exactly a sentence. It's just a part of sentence
238 separated by punctuation.
249 strid id
; /// real string
250 strid cid
; /// lowercased string
252 //std::string::iterator start,end;
254 unsigned int category
;
257 strid
get_id() const { return id
; }
258 strid
get_cid() const { return cid
; }
263 std::vector
<Syllable
> syllables
;
264 friend class Syllable
;
/// Declaration only: takes an input string s and a vector of strings ss.
void tokenize_punctuation(const std::string &s, std::vector<std::string> &ss);
270 Sentence(const Lattice
& l
);
271 Sentence(std::istream
&is
);
272 Sentence(const std::string
&st
):sent_(st
) {}
273 void set(const std::string
&st
) { sent_
= st
; syllables
.clear(); }
274 std::string
get() const { return sent_
; }
277 unsigned int get_syllable_count() const { return syllables
.size(); }
278 // void get_word_number() { return word.size(); }
279 Syllable
& operator[] (unsigned int i
) { return syllables
[i
]; }
280 Syllable
operator[] (unsigned int i
) const { return syllables
[i
]; }
281 // Syllable& operator[] (int i) { return syllables[i]; }
bool is_contiguous(unsigned int i);  // are i and i+1 contiguous?
void merge(unsigned int i);
284 friend std::ostream
& operator <<(std::ostream
&os
, const Sentence
&st
);
287 typedef Sentence
* SentenceRef
;
290 Segmentation stores a sequence of WordEntry indices.
291 From WordEntry we can get the real word.
294 struct Segmentation
: public std::vector
<uint
>
296 boost::shared_ptr
<WordEntries
> we
; /// WordEntries associated with Segmentation
297 float prob
; /// total prob
298 int distance
; /// total distance
300 Segmentation(boost::shared_ptr
<WordEntries
> _we
= boost::shared_ptr
<WordEntries
>()):
305 Segmentation(const Segmentation
&seg
):std::vector
<uint
>(seg
) {
307 distance
= seg
.distance
;
311 const WordEntry
& operator[] (int id
) const {
312 return (*we
)[std::vector
<uint
>::operator[](id
)];
314 friend std::ostream
& operator <<(std::ostream
&os
,const Segmentation
&seg
);
315 std::ostream
& pretty_print(std::ostream
&os
,const Sentence
&st
);
319 typedef std::vector
<Segmentation
> Segmentations
;
323 std::vector
<strid
> suggestions
;
326 typedef std::vector
<Suggestion
> Suggestions
;
328 void apply_separator(std::set
<WordEntry
> &wes
,int p
);
331 //void spell_check1(Sentence &st,Suggestions &s);
332 //void spell_check2(Sentence &st,Segmentation &seg,Suggestions &s);