From 4182598a937f80f980593b5739a8f22ce63ad2cd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Fri, 13 Feb 2004 11:07:10 +0000 Subject: [PATCH] Libvspell now using new engine. Everything is stored in WordEntries. Possible mispelled positions are generated. --- libvspell/Doxyfile | 175 +++++++++++ libvspell/Makefile.am | 1 + libvspell/Makefile.in | 81 ++--- libvspell/dictionary.cpp | 49 ++- libvspell/posgen.h | 33 ++ libvspell/sections.cpp | 116 +++++++ libvspell/sentence.cpp | 133 +++++++- libvspell/spell.cpp | 4 +- libvspell/spell.h | 200 +++++++++++- libvspell/syllable.h | 2 +- libvspell/wfst.cpp | 800 ++++++++++++++++++++--------------------------- libvspell/wfst.h | 81 +++-- libvspell/wordnode.h | 15 +- libvspell/words.cpp | 244 +++++++++++++++ 14 files changed, 1367 insertions(+), 567 deletions(-) create mode 100644 libvspell/Doxyfile create mode 100644 libvspell/posgen.h create mode 100644 libvspell/sections.cpp rewrite libvspell/wfst.cpp (83%) create mode 100644 libvspell/words.cpp diff --git a/libvspell/Doxyfile b/libvspell/Doxyfile new file mode 100644 index 0000000..4895160 --- /dev/null +++ b/libvspell/Doxyfile @@ -0,0 +1,175 @@ +# Doxyfile 1.2.12-20011209 + +#--------------------------------------------------------------------------- +# General configuration options +#--------------------------------------------------------------------------- +PROJECT_NAME = LibVspell +PROJECT_NUMBER = +OUTPUT_DIRECTORY = +OUTPUT_LANGUAGE = English +EXTRACT_ALL = YES +EXTRACT_PRIVATE = YES +EXTRACT_STATIC = YES +EXTRACT_LOCAL_CLASSES = YES +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = $(PWD)/ +INTERNAL_DOCS = NO +STRIP_CODE_COMMENTS = YES +CASE_SENSE_NAMES = NO +SHORT_NAMES = NO +HIDE_SCOPE_NAMES = NO +VERBATIM_HEADERS = YES +SHOW_INCLUDE_FILES = YES +JAVADOC_AUTOBRIEF = YES +INHERIT_DOCS = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +DISTRIBUTE_GROUP_DOC = NO +TAB_SIZE = 8 +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +ALIASES = +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +OPTIMIZE_OUTPUT_FOR_C = NO +SHOW_USED_FILES = YES +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = wfst.cpp dictionary.cpp distance.cpp sentence.cpp spell.cpp debug.h distance.h sentence.h syllable.h wordnode.h dictionary.h propername.h spell.h wfst.h + + +FILE_PATTERNS = *.h +RECURSIVE = NO +EXCLUDE_PATTERNS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_SOURCE_FILES = NO +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +REFERENCED_BY_RELATION = YES +REFERENCES_RELATION = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +GENERATE_CHI = NO +BINARY_TOC = NO +TOC_EXPAND = NO +DISABLE_INDEX = NO +ENUM_VALUES_PER_LINE = 4 +GENERATE_TREEVIEW = YES +TREEVIEW_WIDTH = 250 +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = +COMPACT_LATEX = NO +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = NO +LATEX_BATCHMODE = NO +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = +COMPACT_RTF = NO +RTF_HYPERLINKS = YES +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = +MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::addtions related to external references +#--------------------------------------------------------------------------- +TAGFILES = qtools_docs/qtools.tag=../../qtools_docs/html +GENERATE_TAGFILE = +ALLEXTERNALS = NO +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +HAVE_DOT = YES +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +TEMPLATE_RELATIONS = YES +HIDE_UNDOC_RELATIONS = YES +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +GRAPHICAL_HIERARCHY = YES +CALL_GRAPH = YES +DOT_IMAGE_FORMAT = png +DOT_PATH = +DOTFILE_DIRS = +MAX_DOT_GRAPH_WIDTH = 1024 +MAX_DOT_GRAPH_HEIGHT = 1024 +GENERATE_LEGEND = YES +DOT_CLEANUP = YES +UML_LOOK = NO +#--------------------------------------------------------------------------- +# Configuration::addtions related to the search engine +#--------------------------------------------------------------------------- +SEARCHENGINE = NO diff --git a/libvspell/Makefile.am b/libvspell/Makefile.am index 3153c2b..9a4ea62 100644 --- a/libvspell/Makefile.am +++ b/libvspell/Makefile.am @@ -7,6 +7,7 @@ libvspell_a_SOURCES = \ distance.cpp distance.h\ spell.cpp spell.h\ wfst.cpp wfst.h\ + words.cpp sections.cpp\ debug.h INCLUDES = -I$(top_builddir)/libsrilm libvspell_a_LIBADD = $(top_builddir)/libsrilm/libsrilm.a diff --git a/libvspell/Makefile.in b/libvspell/Makefile.in index 71a3ac9..fef0bbf 100644 --- a/libvspell/Makefile.in +++ b/libvspell/Makefile.in @@ -35,7 +35,6 @@ POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : -host_triplet = @host@ ACLOCAL = @ACLOCAL@ AMDEP_FALSE = @AMDEP_FALSE@ AMDEP_TRUE = @AMDEP_TRUE@ @@ -55,7 +54,6 @@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ -ECHO = @ECHO@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ @@ -68,7 +66,6 @@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ -LIBTOOL = @LIBTOOL@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ @@ -81,6 +78,7 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PKG_CONFIG = @PKG_CONFIG@ +POW_LIB = @POW_LIB@ RANLIB = @RANLIB@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ @@ -100,18 +98,10 @@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ bindir = @bindir@ -build = @build@ build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ datadir = @datadir@ exec_prefix = @exec_prefix@ -host = @host@ host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ @@ -135,6 +125,7 @@ libvspell_a_SOURCES = \ distance.cpp distance.h\ spell.cpp spell.h\ wfst.cpp wfst.h\ + words.cpp sections.cpp\ debug.h INCLUDES = -I$(top_builddir)/libsrilm @@ -151,7 +142,8 @@ libvspell_a_AR = $(AR) cru libvspell_a_DEPENDENCIES = $(top_builddir)/libsrilm/libsrilm.a am_libvspell_a_OBJECTS = syllable.$(OBJEXT) dictionary.$(OBJEXT) \ propername.$(OBJEXT) sentence.$(OBJEXT) distance.$(OBJEXT) \ - spell.$(OBJEXT) wfst.$(OBJEXT) + spell.$(OBJEXT) wfst.$(OBJEXT) words.$(OBJEXT) \ + sections.$(OBJEXT) libvspell_a_OBJECTS = $(am_libvspell_a_OBJECTS) DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) @@ -159,23 +151,18 @@ depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/dictionary.Po \ @AMDEP_TRUE@ ./$(DEPDIR)/distance.Po ./$(DEPDIR)/propername.Po \ -@AMDEP_TRUE@ ./$(DEPDIR)/sentence.Po ./$(DEPDIR)/spell.Po \ -@AMDEP_TRUE@ ./$(DEPDIR)/syllable.Po ./$(DEPDIR)/wfst.Po +@AMDEP_TRUE@ ./$(DEPDIR)/sections.Po ./$(DEPDIR)/sentence.Po \ +@AMDEP_TRUE@ ./$(DEPDIR)/spell.Po ./$(DEPDIR)/syllable.Po \ +@AMDEP_TRUE@ ./$(DEPDIR)/wfst.Po ./$(DEPDIR)/words.Po CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ - $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ - $(AM_CXXFLAGS) $(CXXFLAGS) CXXLD = $(CXX) -CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ - $(AM_LDFLAGS) $(LDFLAGS) -o $@ +CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ + -o $@ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ - $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) CCLD = $(CC) -LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ - $(AM_LDFLAGS) $(LDFLAGS) -o $@ +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ DIST_SOURCES = $(libvspell_a_SOURCES) DATA = $(data_DATA) @@ -185,7 +172,7 @@ SOURCES = $(libvspell_a_SOURCES) all: all-am .SUFFIXES: -.SUFFIXES: .cpp .lo .o .obj +.SUFFIXES: .cpp .o .obj $(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) cd $(top_srcdir) && \ $(AUTOMAKE) --gnu libvspell/Makefile @@ -237,10 +224,12 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dictionary.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/distance.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/propername.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sentence.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/spell.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/syllable.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wfst.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/words.Po@am__quote@ distclean-depend: -rm -rf ./$(DEPDIR) @@ -266,26 +255,6 @@ distclean-depend: @AMDEP_TRUE@@am__fastdepCXX_FALSE@ depfile='$(DEPDIR)/$*.Po' tmpdepfile='$(DEPDIR)/$*.TPo' @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` - -.cpp.lo: -@am__fastdepCXX_TRUE@ if $(LTCXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" \ -@am__fastdepCXX_TRUE@ -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$<; \ -@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Plo"; \ -@am__fastdepCXX_TRUE@ else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; \ -@am__fastdepCXX_TRUE@ fi -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ depfile='$(DEPDIR)/$*.Plo' tmpdepfile='$(DEPDIR)/$*.TPlo' @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< - -mostlyclean-libtool: - -rm -f *.lo - -clean-libtool: - -rm -rf .libs _libs - -distclean-libtool: - -rm -f libtool uninstall-info-am: dataDATA_INSTALL = $(INSTALL_DATA) install-dataDATA: $(data_DATA) @@ -423,12 +392,12 @@ maintainer-clean-generic: @echo "it deletes files that may require special tools to rebuild." clean: clean-am -clean-am: clean-generic clean-libLIBRARIES clean-libtool mostlyclean-am +clean-am: clean-generic clean-libLIBRARIES mostlyclean-am distclean: distclean-am distclean-am: clean-am distclean-compile distclean-depend \ - distclean-generic distclean-libtool distclean-tags + distclean-generic distclean-tags dvi: dvi-am @@ -454,8 +423,7 @@ maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am -mostlyclean-am: mostlyclean-compile mostlyclean-generic \ - mostlyclean-libtool +mostlyclean-am: mostlyclean-compile mostlyclean-generic pdf: pdf-am @@ -469,15 +437,14 @@ uninstall-am: uninstall-dataDATA uninstall-info-am \ uninstall-libLIBRARIES .PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ - clean-libLIBRARIES clean-libtool ctags distclean \ - distclean-compile distclean-depend distclean-generic \ - distclean-libtool distclean-tags distdir dvi dvi-am info \ - info-am install install-am install-data install-data-am \ - install-dataDATA install-exec install-exec-am install-info \ - install-info-am install-libLIBRARIES install-man install-strip \ - installcheck installcheck-am installdirs maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + clean-libLIBRARIES ctags distclean distclean-compile \ + distclean-depend distclean-generic distclean-tags distdir dvi \ + dvi-am info info-am install install-am install-data \ + install-data-am install-dataDATA install-exec install-exec-am \ + install-info install-info-am install-libLIBRARIES install-man \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \ tags uninstall uninstall-am uninstall-dataDATA \ uninstall-info-am uninstall-libLIBRARIES diff --git a/libvspell/dictionary.cpp b/libvspell/dictionary.cpp index b27fec4..e3330f8 100644 --- a/libvspell/dictionary.cpp +++ b/libvspell/dictionary.cpp @@ -1,5 +1,6 @@ #include "config.h" // -*- tab-width: 2 -*- #include +#include #include #include #include @@ -7,7 +8,6 @@ #include "dictionary.h" #include "wordnode.h" #include "distance.h" -#include #include #ifndef _SArray_cc_ #include @@ -169,8 +169,12 @@ bool WordNode::load(const char* filename) int i,nr_syllables = syllables.size(); node->info->syllables = new VocabIndex[nr_syllables+1]; node->info->syllables[nr_syllables] = Vocab_None; - for (int i = 0;i < nr_syllables;i ++) + //cerr << "Word " << toks[0] << "(" << node->info->id << "| "; + for (int i = 0;i < nr_syllables;i ++) { node->info->syllables[i] = syllables[i]; + //cerr << node->info->syllables[i] << ","; + } + //cerr << endl; node->set_prob(atof(toks[2].c_str())); if (csyllables != syllables) { @@ -259,7 +263,7 @@ void WordNode::set_prob(float _prob) } -int WordNode::get_syllable_count() +int WordNode::get_syllable_count() const { ASSERT(info != NULL); int n; @@ -267,11 +271,11 @@ int WordNode::get_syllable_count() return n; } -void WordNode::get_syllables(vector &syllables) +void WordNode::get_syllables(vector &syllables) const { - // ASSERT: info != NULL + ASSERT(info != NULL); int n; - for (n = 0;info->syllables[n] != 0;n ++) + for (n = 0;info->syllables[n] != Vocab_None;n ++) syllables.push_back(info->syllables[n]); } @@ -358,6 +362,7 @@ bool FuzzyWordNode::fuzzy_get_next_with_syllable(strid str, } +// the get_next()'s result must be included. bool FuzzyWordNode::fuzzy_get_next(strid str, vector& _nodes) const { @@ -425,8 +430,11 @@ strid StringArchive::operator[] (VocabString s) return vi; if (blocked) { vi = rest->getIndex(s); - if (vi == Vocab_None) - return rest->addWord(s)+dict.numWords(); + if (vi == Vocab_None) { + int i = rest->addWord(s)+dict.numWords(); + cerr << "New word " << s << " as " << i << endl; + return i; + } return vi+dict.numWords(); } return dict.addWord(s); @@ -477,6 +485,31 @@ strpair make_strpair(strid str) pair.cid = sarch[st]; return pair; } + +void WordNode::dump_next(std::ostream &os) const +{ + node_map_iterator iter(*nodes); + strid id; + WordNodePtr *pnode; + while ((pnode = iter.next(id))) { + os << (*pnode)->id << " "; + } +} + +std::ostream& operator << (std::ostream &os,const WordNode::DistanceNode &dnode) +{ + os << *dnode.node; + return os; +} +std::ostream& operator << (std::ostream &os,const WordNode &node) +{ + std::vector syll; + node.get_syllables(syll); + for (int i = 0;i < syll.size();i ++) + os << sarch[syll[i]] << "(" << syll[i] << ") "; + return os; +} + /* } */ diff --git a/libvspell/posgen.h b/libvspell/posgen.h new file mode 100644 index 0000000..678abbd --- /dev/null +++ b/libvspell/posgen.h @@ -0,0 +1,33 @@ +#ifndef __POSGEN_H__ // -*- tab-width: 2 mode: c++ -*- +#define __POSGEN_H__ + +#ifndef __SPELL_H__ +#include "spell.h" +#endif + +#ifndef __VECTOR__ +#include +#endif + +/** + Position generator. + */ + +class Generator +{ +private: + int ii,i,nr_misspelled; + std::vector misspelled_pos; + bool run; + + int len; + SentenceRef st; + +public: + void init(const Sentence &st); + bool step(std::vector &pos,int &len); + void done(); +}; + + +#endif diff --git a/libvspell/sections.cpp b/libvspell/sections.cpp new file mode 100644 index 0000000..7a9b5bf --- /dev/null +++ b/libvspell/sections.cpp @@ -0,0 +1,116 @@ +#include "config.h" // -*- tab-width: 2 -*- +#include "wfst.h" +#include +#include +#include +#include +#include +#include +#include + + +using namespace std; + +extern int ngram_length; + +/** + * dump a Sections + */ + +std::ostream& operator << (std::ostream &os,const Sections &me) +{ + using namespace boost; + const Sentence &_sent = *me.st; + int i,ii,nn,n = _sent.get_syllable_count(); + ii = 0; + nn = me.size(); + for (i = 0;i < n;i ++) { + if (ii < nn && me[ii].start == i) + os << "["; + os << format("%s") % sarch[_sent[i].id]; + if (ii < nn && me[ii].start+me[ii].len-1 == i) { + os << "]"; + ii ++; + } + os << " "; + } + return os; +} + +/** + Split Words into Sections + \param words input + \param sects output + */ + +void Sections::construct(const Words &words) +{ + Sections& sects = *this; + + // mark all possible words. All bounds left is really bounds. + // because we need at most n-1 boundary syllable for n-gram + // if two ambiguous sections is near than n-1 syllables, then merge them. + int i,ii,n,nn; + + sects.st = words.st; + + n = words.get_word_count(); + + vector bound(n); + + for (i = 0;i < n;i ++) { + nn = words.get_len(i); + for (ii = 0;ii < nn-2;ii ++) + bound[ii+i] = 1; + } + bound[bound.size()-1] = 1; // it's obvious there is a boundary in the end + + //copy(bound.begin(),bound.end(),ostream_iterator(cerr," ")); + + // "tokenize" _sent + int pos,len = bound.size(); + Section sect; + + sect.start = 0; + sect.len = 0; + + for (pos = 0;pos < len;pos ++) { + // ignore "1" boundaries + if (bound[pos]) + continue; + + bool is_section; + // just write down and figure out what the formulas mean + sect.len = pos - sect.start + 1; + is_section = words.get_len(sect.start) > 2 || + (words.get_len(sect.start) >= 2 && + words.get_fuzzy_count(sect.start,1) > 0); + + if (is_section) { + // now merge two sections (this and the previous) if needed + if (!sects.empty()) { + Section &prev = sects.back(); + if (sect.start - (prev.start + prev.len) < ngram_length-1) + prev.len = pos - prev.start + 1; // merge + else + sects.push_back(sect); // not merge + } else + sects.push_back(sect); // sects is empty -> nothing to merge + } + sect.start = pos+1; + } + + if (sects.empty()) { + sect.start=0; + sect.len = n-sect.start; + sects.push_back(sect); + } +} + +std::ostream& operator <<(std::ostream &os,const Segmentation &seg) +{ + int i,n = seg.size(); + for (i = 0;i < n;i ++) + os << "[" << seg[i].node << "] "; + return os; +} diff --git a/libvspell/sentence.cpp b/libvspell/sentence.cpp index d6e7b1a..757a9c6 100644 --- a/libvspell/sentence.cpp +++ b/libvspell/sentence.cpp @@ -1,4 +1,5 @@ -#include "sentence.h" +#include "sentence.h" // -*- tab-width: 2 -*- +#include "spell.h" using namespace std; void sentences_split(const string &_input,vector &output) @@ -36,3 +37,133 @@ void sentences_split(const string &_input,vector &output) if (!input.empty()) output.push_back(input); } + +void Sentence::standardize() +{ +} + +/** + Split punctiations off the token + \param s input token + \param ret a sequence of tokens +*/ + +void Sentence::tokenize_punctuation(const string &s,vector &ret) +{ + int npos,pos = 0,start = 0; + int len = s.size(); + while (start < len) { + if (pos < len) { + npos = s.find_first_of("!#()'\";:.,?/",pos); + if (npos == string::npos) + npos = len; + pos = npos+1; + if (npos < len) { // TODO: some checks here + // floating point + if ((s[npos] == '.' || s[npos] == ',') && + (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9')) + continue; // skip the dot/comma + + // date + if ((s[npos] == '/') && + (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9')) + continue; + + // only split dot when it's in the end. + if (s[npos] == '.' && npos+1 != len) + continue; + } + } else + npos = len; + + ret.push_back(s.substr(start,npos-start)); + if (npos < len) + ret.push_back(s.substr(npos,1)); + start = npos+1; + } +} + +/** + Convert a string to a sequence of token +*/ + +void Sentence::tokenize() +{ + string &st = sent_; + int pos = 0,len = st.size(); + bool started = false; // we have found start of a new word + Syllable sy; + sy.span = 1; + sy.sent_ = this; + sy.sid = -1; + + while (pos < len) { + if (started) { + pos = st.find(' ',sy.start); + if (pos == string::npos) + pos = len; + vector ss; + int i,n; + tokenize_punctuation(st.substr(sy.start,pos-sy.start),ss); + n = ss.size(); + for (i = 0;i < n;i ++) { + string &s = ss[i]; + sy.id = sarch[s]; + transform(s.begin(),s.end(),s.begin(),viet_tolower); + sy.cid = sarch[s]; + syllables.push_back(sy); + sy.start += s.size(); + } + started = false; + } else { + if (st[pos] == ' ') + pos++; + else { + started = true; + sy.start = pos; + } + } + } +} + + +/** + Dump a Sentence +*/ + +ostream& operator <<(ostream &os, const Sentence &st) +{ + int cc,i,n = st.get_syllable_count(); + /* + for (i = 0;i < n;i ++) { + cout << *st[i] << " "; + if (items[i].flags & SEGM_SEPARATOR) + cout << "| "; + } + cout << endl; + */ + /* + for (cc = i = 0;i < n;i ++) { + int c = st.words[st[i]].node(st).node->get_syllable_count(); + for (int k = 0;k < c;k ++) { + if (k) os << "_"; + os << sarch[st[cc+k].get_id()]; + } + cc += c; + os << " "; + } + os << prob << endl; + */ +} + +/* + std::string& Sentence::const_iterator::operator++() + { + } + + std::string Sentence::const_iterator::operator++(int) + { + } +*/ + + diff --git a/libvspell/spell.cpp b/libvspell/spell.cpp index a64c97f..f51ed33 100644 --- a/libvspell/spell.cpp +++ b/libvspell/spell.cpp @@ -54,12 +54,12 @@ void spell_check1(Sentence &st,Suggestions &sugg) void spell_check2(Sentence &st,Segmentation &seg,Suggestions &sugg) { - int i,n = seg.items.size(); + int i,n = seg.size(); int cc = 0; for (i = 0;i < n;i ++) { vector sylls; - int len = seg.items[i].state->get_syllable_count(); + int len = seg[i].node->get_syllable_count(); if (len == 1) { cc += len; continue; diff --git a/libvspell/spell.h b/libvspell/spell.h index 28ca0b7..d0b4121 100644 --- a/libvspell/spell.h +++ b/libvspell/spell.h @@ -5,6 +5,10 @@ #include "dictionary.h" #endif +#ifndef __WORDNODE_H__ +#include "wordnode.h" +#endif + #ifndef __VECTOR__ #include #endif @@ -12,6 +16,173 @@ #include #endif +class Sentence; + +/** + Store information of a word in a sentence. + */ + +struct WordEntry { + int pos; /// syllable index + int len; /// word len + int fuzid; /// to identify different WordEntry with the same pos/len. + /// fuzid is a mask of fuzzy/exact match. + int id; /// index in WordEntries + + WordNode::DistanceNode node; /// Word content + + bool operator < (const WordEntry &we) const { + return pos != we.pos ? pos < we.pos : + (len != we.len ? len < we.len : + (fuzid != we.fuzid ? fuzid < we.fuzid : node.node < we.node.node)); + } +}; + +std::ostream& operator << (std::ostream &os,const WordEntry &we); + +typedef WordEntry* WordEntryRef; +typedef std::vector WordEntries; +typedef std::vector WordEntryRefs; +//struct WordEntries:public std::vector { +//}; + +/** + Store all WordEntry pointers which started at a specified pos, + and have specified len + */ + +struct WordInfo { + WordEntry* exact_match; + WordEntryRefs fuzzy_match; +}; + +/** + Store WordInfo(s) which have a specified length + */ + +class WordInfos : public std::vector { +public: + int exact_len; + WordEntryRefs fuzzy_map; /// contains all WordEntry which are fuzzy at this position + WordEntryRefs we; /// contains all WordEntry which started at this pos +}; + +/** + Store WordInfos(s) started at specified positions. + */ + +class Words:public std::vector { +public: + + WordEntries *we; + Sentence *st; + + void construct(const Sentence &st); + + /// Get the number of available positions, from 0 to n-1 + int get_word_count() const { + return size(); + } + + /** + Get maximal length of words at specified position. + \param i specify a position in sentence + */ + int get_len(int i) const { + return (*this)[i]->size(); + } + + /** + Get the length of the exact words at specified position. + \param i specify a position in sentence + */ + int get_exact_len(int i) const { + return (*this)[i]->exact_len; + } + + /** + Get fuzzy map at specified position. + \param i specify a position + */ + + const WordEntryRefs& get_fuzzy_map(int i) const { + return (*this)[i]->fuzzy_map; + } + + /** + Get number of fuzzy words at specified (pos,len). + \param i specify pos + \param l specify len + */ + int get_fuzzy_count(int i,int l) const { + return (*(*this)[i])[l]->fuzzy_match.size(); + } + + /** + Get the first WordEntry at specified pos. + \param pos specify a position. + */ + + WordEntryRefs& get_we(int pos) { + return (*this)[pos]->we; + } + + const WordEntry& get_we_fuzzy(int i,int l,int f) const { + return *(*(*this)[i])[l]->fuzzy_match[f]; + } + + const WordEntry* get_we_exact(int i,int l) const { + return (*(*this)[i])[l]->exact_match; + } + + /** + Get the Node of a specified fuzzy match (pos,len,fuz) + \param i is pos + \param l is len + \param f is fuz + */ + + WordNode::DistanceNode& get_fuzzy(int i,int l,int f) { + return (*(*this)[i])[l]->fuzzy_match[f]->node; + } + + /// a const version of get_fuzzy() + const WordNode::DistanceNode& get_fuzzy(int i,int l,int f) const{ + return (*(*this)[i])[l]->fuzzy_match[f]->node; + } + + ~Words(); // WARN: destroy all. + + /** + Construct Words based on member we. + we must be valid. + */ + + void construct(); + + /** + Construct Words based on another Words. + Only exact matches are copied. + \param w specify the "template" Words + */ + + void based_on(const Words &w); + + /** + Add WordEntry w into Words. + */ + + void add(WordEntry &w); + + friend std::ostream& operator << (std::ostream& os,const Words &w); +}; +// Words[from][len].fuzzy_match[i] + +/** + Sentence is used to store a sequence of syllables. + Sentence and Words will keep all necessary info for a spelling checker. + */ + class Sentence { public: @@ -51,26 +222,31 @@ public: // Syllable& operator[] (int i) { return syllables[i]; } bool is_contiguous(int i); // i & i+1 is contiguous ? void merge(int i); + }; -struct Segmentation -{ - struct Item { - int flags; // Separator mark - int distance; // from ed() or fuzzy syllable - WordNodePtr state; // used to get prob. +typedef Sentence* SentenceRef; - Item():flags(0),distance(0),state(NULL) {} - }; +/** + Segmentation store a sequence of WordEntry index. + From WordEntry we can get the real word. + */ - std::vector items; - float prob; // total prob - int distance; // total distance +struct Segmentation : public std::vector +{ + WordEntries *we; /// WordEntries associated with Segmentation + float prob; /// total prob + int distance; /// total distance Segmentation():prob(0),distance(0) {} - void print(std::ostream &os,const Sentence &st); + const WordEntry& operator[] (int id) const { + return (*we)[std::vector::operator[](id)]; + } + friend std::ostream& operator <<(std::ostream &os,const Segmentation &seg); }; +typedef std::vector Segmentations; + struct Suggestion { int id; std::vector suggestions; diff --git a/libvspell/syllable.h b/libvspell/syllable.h index 78f2220..a66d389 100644 --- a/libvspell/syllable.h +++ b/libvspell/syllable.h @@ -47,7 +47,7 @@ public: void standardize(std::string str); void print(); strid to_id(); - string to_str(); + std::string to_str(); }; extern char *vowels[]; diff --git a/libvspell/wfst.cpp b/libvspell/wfst.cpp dissimilarity index 83% index 702e2dd..db47658 100644 --- a/libvspell/wfst.cpp +++ b/libvspell/wfst.cpp @@ -1,454 +1,346 @@ -#include "config.h" // -*- tab-width: 2 -*- -#include "wfst.h" -#include -#include -#include -#include -#include - -using namespace std; - - -void Sentence::standardize() -{ -} - -void Sentence::tokenize_punctuation(const string &s,vector &ret) -{ - int npos,pos = 0,start = 0; - int len = s.size(); - while (start < len) { - if (pos < len) { - npos = s.find_first_of("!#()'\";:.,?/",pos); - if (npos == string::npos) - npos = len; - pos = npos+1; - if (npos < len) { // TODO: some checks here - // floating point - if ((s[npos] == '.' || s[npos] == ',') && - (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9')) - continue; // skip the dot/comma - - // date - if ((s[npos] == '/') && - (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9')) - continue; - - // only split dot when it's in the end. - if (s[npos] == '.' && npos+1 != len) - continue; - } - } else - npos = len; - - ret.push_back(s.substr(start,npos-start)); - if (npos < len) - ret.push_back(s.substr(npos,1)); - start = npos+1; - } -} - -void Sentence::tokenize() -{ - string &st = sent_; - int pos = 0,len = st.size(); - bool started = false; // we have found start of a new word - Syllable sy; - sy.span = 1; - sy.sent_ = this; - sy.sid = -1; - - while (pos < len) { - if (started) { - pos = st.find(' ',sy.start); - if (pos == string::npos) - pos = len; - vector ss; - int i,n; - tokenize_punctuation(st.substr(sy.start,pos-sy.start),ss); - n = ss.size(); - for (i = 0;i < n;i ++) { - string &s = ss[i]; - sy.id = sarch[s]; - transform(s.begin(),s.end(),s.begin(),viet_tolower); - sy.cid = sarch[s]; - syllables.push_back(sy); - sy.start += s.size(); - } - started = false; - } else { - if (st[pos] == ' ') - pos++; - else { - started = true; - sy.start = pos; - } - } - } -} - -/* - std::string& Sentence::const_iterator::operator++() - { - } - - std::string Sentence::const_iterator::operator++(int) - { - } -*/ - -class SegmentationComparator -{ -public: - bool operator() (const Segmentation &seg1,const Segmentation &seg2) { - return seg1.prob > seg2.prob; - } -}; - -void WFST::segment_best(const Sentence &_sent, - const Words &words, - Segmentation &seps) -{ - vector segms; - - // Try to split _sent into small parts - // mark all multiple-syllable words - // those leaved unmarked are obviously word-segmented correctly - vector marks(_sent.get_syllable_count()); - vector bound(_sent.get_syllable_count()); - int i,ii,n,nn; - n = words.get_word_count(); - for (i = 0;i < n;i ++) { - nn = words.get_len(i); - if (nn > 1) - for (ii = 0;ii < nn;ii ++) - marks[i+ii] = 1; - for (ii = 0;ii < nn-1;ii ++) - bound[ii+i] = 1; - } - - for (i = 0;i < n;i ++) { - printf("%s",sarch[_sent[i].id]); - if (i+1get_prob(); - int len = seps.items.size(); - if (len > 1 && ngram_enabled) { - VocabIndex vi[2]; - vi[0] = seps.items[len-1].state->get_id(); - vi[1] = Vocab_None; - seps.prob += -ngram.wordProb(seps.items[len-2].state->get_id(),vi); - } - seps.distance += item.distance; - pos++; - } else { - start = pos; - while (pos < len && marks[pos] == 1) pos ++; - segms.clear(); - segment_all1(_sent,words,start,pos,segms); - if (!segms.empty()) { - int len = seps.items.size(); - if (len > 0 && ngram_enabled) { - VocabIndex vi[2]; - vi[0] = segms[0].items[0].state->get_id(); - vi[1] = Vocab_None; - seps.prob += -ngram.wordProb(seps.items[len-1].state->get_id(),vi); - } - seps.items.insert(seps.items.end(), - segms[0].items.begin(), - segms[0].items.end()); - seps.prob += segms[0].prob; - seps.distance += segms[0].distance; - } - } - } - - /* - vector::iterator i; - for (i = segms.begin();i != segms.end(); ++i) - i->print(_sent); - */ -} - - -// find all possible words -// words must be cleared before calling this function -void WFST::get_all_words(const Sentence &sent,Words &words) -{ - int i,n,ii,nn,k; - vector states,old_states; - vector syll; - - states.reserve(10); - old_states.reserve(10); - - n = sent.get_syllable_count(); - - syll.resize(n); - for (i = 0;i < n;i ++) - syll[i] = sent[i].get_cid(); - - for (i = 0;i < n;i ++) { - - // make a word - words.push_back(new WordInfos()); - WordInfos &winfos = *words.back(); - - // finding all possible words started by the i-th syllable. - - // init states with root node - states.clear(); - states.push_back(WordNode::DistanceNode(wl)); - - k = 0; - while (!states.empty() && i+k < n) { - - // make a winfo - winfos.push_back(new WordInfo()); - WordInfo &winfo = *winfos.back(); - - // save states - old_states = states; - - // clear states, ready for new states - states.clear(); - - // get next new states -> states - nn = old_states.size(); - for (ii = 0;ii < nn;ii ++) { - WordNode::DistanceNode &state = old_states[ii]; - WordNodePtr exact_node = state.node->get_next(syll[i+k]); - if (exact_node) states.push_back(exact_node); - state.node->fuzzy_get_next(syll[i+k],states); - } - - // fill winfo - nn = states.size(); - for (ii = 0;ii < nn;ii ++) - if (states[ii].node->get_prob() >= 0) { - winfo.fuzzy_match.push_back(states[ii]); - } - - // unknown word - if (k == 0 && winfo.fuzzy_match.empty()) { - WordNode::DistanceNode dn; - dn.node = wl->get_next(unk_id); - dn.distance = 0; - winfo.fuzzy_match.push_back(dn); - } - - // remove duplicated items - sort(winfo.fuzzy_match.begin(),winfo.fuzzy_match.end()); - winfo.fuzzy_match.erase(unique(winfo.fuzzy_match.begin(), - winfo.fuzzy_match.end()), - winfo.fuzzy_match.end()); - - k ++; - } - // remove the last if it is empty - while (!winfos.empty() && winfos.back()->fuzzy_match.empty()) - winfos.pop_back(); - } - -} - -void Words::print() const -{ - int i, nn = get_word_count(); - for (i = 0;i < nn;i ++) { - int nnn = get_len(i); - cerr << "From " << i << endl; - for (int ii = 0;ii < nnn;ii ++) { - int nnnn = get_fuzzy_count(i,ii); - cerr << "Len " << ii << endl; - for (int iii = 0;iii < nnnn;iii ++) { - cerr << get_fuzzy(i,ii,iii).node->get_id( ) << "-" << sarch[get_fuzzy(i,ii,iii).node->get_id()] << " "; - cerr << get_fuzzy(i,ii,iii).node->get_syllable_count() << " "; - cerr << get_fuzzy(i,ii,iii).distance << " "; - cerr << get_fuzzy(i,ii,iii).node->get_prob() << endl; - } - } - } -} - -// WFST (Weighted Finite State Transducer) implementation -// TUNE: think about genetic/greedy. Vietnamese is almost one-syllable words.. -// we find where is likely a start of a word, then try to construct word -// and check if others are still valid words. - -// the last item is always the incompleted item. We will try to complete -// a word from the item. If we reach the end of sentence, we will remove it -// from segs - -struct Trace -{ - Segmentation s; - int next_syllable; - Trace():next_syllable(0) {} -}; - -void WFST::segment_all(const Sentence &sent,vector &result) -{ - Words words; - get_all_words(sent,words); - segment_all1(sent,words,0,sent.get_syllable_count(),result); - /* - int nn = words.size(); - for (i = 0;i < nn;i ++) { - int nnn = words[i].size(); - cerr << "From " << i << endl; - for (int ii = 0;ii < nnn;ii ++) { - int nnnn = words[i][ii].fuzzy_match.size(); - cerr << "Len " << ii << endl; - for (int iii = 0;iii < nnnn;iii ++) { - cerr << words[i][ii].fuzzy_match[iii].distance << " "; - cerr << words[i][ii].fuzzy_match[iii].node->get_prob() << endl; - } - } - } - */ - -} - -void WFST::segment_all1(const Sentence &sent, - const Words &words, - int from,int to, - std::vector &result) -{ - int nr_syllables = to; - vector segs; // used to store all incomplete seg - int i; - int count = 0; - Trace trace; - - trace.next_syllable = from; - //Segmentation start_seg; - segs.reserve(100); - segs.push_back(trace); // empty seg - - while (!segs.empty()) { - // get one - trace = segs.back(); - segs.pop_back(); - Segmentation seg = trace.s; - int next_syllable = trace.next_syllable; -// cerr << segs.size() << " " << count << endl; - - // segmentation completed. throw it away - // TUNE: do we need these? -> yes we do - // in case of bad/incomplete segmentations - if (next_syllable == nr_syllables) - continue; - - int nr_size_1 = words.get_len(next_syllable); - - // word with the length of (size_1+1) syllables - for (int size_1 = 0;size_1 < nr_size_1; size_1 ++) { - int nr_fuzzy = words.get_fuzzy_count(next_syllable,size_1); - - // number of words - for (i = 0;i < nr_fuzzy;i ++) { - const WordNode::DistanceNode &next_state = - words.get_fuzzy(next_syllable,size_1,i); - - // New segmentation for longer incomplete word - Trace newseg; - newseg = trace; - newseg.s.items.push_back(Segmentation::Item()); - Segmentation::Item &item = newseg.s.items.back(); - item.state = next_state.node; // move to the next node - item.distance = 0/*FIXME*/; - item.flags |= SEGM_SEPARATOR; - - // Segmentation's parameters - newseg.s.distance += item.distance; - newseg.s.prob += item.state->get_prob(); - int len = newseg.s.items.size(); - if (len > 1 && ngram_enabled) { - VocabIndex vi[2]; - vi[0] = newseg.s.items[len-1].state->get_id(); - vi[1] = Vocab_None; - newseg.s.prob += -ngram.wordProb(newseg.s.items[len-2].state->get_id(),vi); - } - newseg.next_syllable += size_1+1; - if (newseg.next_syllable == nr_syllables) { - result.push_back(newseg.s); - push_heap(result.begin(),result.end(),SegmentationComparator()); - //cerr << "New: ";newseg.s.print(sent); - //cerr << "Candidate: ";result[0].print(sent); - count ++; - } else { - segs.push_back(newseg); - //cerr << "New seg: "; - //newseg.print(sent); - } - } - } - } // end while -} - - -void Segmentation::print(ostream &os, const Sentence &st) -{ - int cc,i,n = items.size(); - /* - for (i = 0;i < n;i ++) { - cout << *st[i] << " "; - if (items[i].flags & SEGM_SEPARATOR) - cout << "| "; - } - cout << endl; - */ - for (cc = i = 0;i < n;i ++) { - int c = items[i].state->get_syllable_count(); - for (int k = 0;k < c;k ++) { - if (k) os << "_"; - os << sarch[st[cc+k].get_id()]; - } - cc += c; - os << " "; - } - os << prob << endl; -} - -Words::~Words() -{ - int pos,nr_pos = get_word_count(); - for (pos = 0;pos < nr_pos;pos ++) { - int len,nr_len = get_len(pos); - for (len = 0;len < nr_len;len ++) - delete (*(*this)[pos])[len]; - delete (*this)[pos]; - } -} +#include "config.h" // -*- tab-width: 2 -*- +#include "wfst.h" +#include +#include +#include +#include +#include +#include +#include +#include "posgen.h" + +using namespace std; + +int ngram_length = 2; // for lazy men ;) + + +/* + Create a "template" segmentation. Mark all sections in template to sects + */ +/* +void WFST::create_base_segmentation(const Words &words, + Sections §s, + Segmentation &seg) +{ + int i,ii,n,nn; + i = ii = 0; + n = words.get_word_count(); + nn = sects.size(); + + while (i < n) + if (ii < nn && sects[ii].start == i) { + seg.items.push_back(Segmentation::Item()); + sects[ii].segment = seg.items.size()-1; + ii ++; + i += sects[ii].len; + } else { + // only word(i,0,0) exists + seg.items.push_back(Segmentation::Item()); + seg.items.back().pos = i; + i ++; + } +} +*/ + +void Generator::init(const Sentence &_st) +{ + nr_misspelled = 3; // REMEMBER THIS NUMBER + misspelled_pos.resize(nr_misspelled); + run = true; + + st = &(Sentence&)_st; + + len = st->get_syllable_count(); + + // initialize + for (i = 0;i < nr_misspelled;i ++) + misspelled_pos[i] = i; + + i = 0; +} + +void Generator::done() +{ +} + +/** + Generate every possible 3-misspelled-positions. + Then call WFST::generate_misspelled_words. + */ + +bool Generator::step(vector &_pos,int &_len) +{ + while (run) { + + // move to the next counter + if (i+1 < nr_misspelled && misspelled_pos[i] < len) { + i ++; + + // do something here with misspelled_pos[i] + _pos = misspelled_pos; + _len = i; + return true; + } + + // the last counter is not full + if (misspelled_pos[i] < len) { + // do something here with misspelled_pos[nr_misspelled] + _pos = misspelled_pos; + _len = nr_misspelled; + misspelled_pos[i] ++; + return true; + } + + // the last counter is full, step back + while (i >= 0 && misspelled_pos[i] == len) i --; + if (i < 0) + run = false; + else { + misspelled_pos[i] ++; + for (ii = i+1;ii < nr_misspelled;ii ++) + misspelled_pos[ii] = misspelled_pos[ii-1]+1; + } + } + return false; +} + +/** + Generate new Words info based on misspelled position info. + Call WFST::get_sections() to split into sections + then call WFST::segment_all1() to do real segmentation. + \param pos contains possibly misspelled position. + \param len specify the actual length pos. Don't use pos.size() +*/ +void WFST::generate_misspelled_words(const vector &pos,int len) +{ + const Words &words = *p_words; + Words w; + + w.based_on(words); + + // 2. Compute the score, jump to 1. if score is too low (pruning 1) + + // create new (more compact) Words structure + int i,j,n = p_st->get_syllable_count(); + for (i = 0;i < len;i ++) { + const WordEntryRefs &fmap = words.get_fuzzy_map(pos[i]); + WordEntryRefs::const_iterator iter; + for (iter = fmap.begin();iter != fmap.end(); ++iter) + w.add(**iter); + } + + //cerr << w << endl; + + // 4. Call get_sections + Sections sects; + sects.construct(words); + + // 5. Call create_base_segmentation + //Segmentation base_seg; + //create_base_segmentation(words,sects,base_seg); + + // 6. Call segment_all1 for each sections. + n = sects.size(); + for (i = 0;i < n;i ++) { + Segmentation seg; + Segmentor segtor; + + segtor.init(*p_st, // Sentence + w, // Words + sects[i].start, // from + sects[i].start+sects[i].len-1); // to + + while (segtor.step(seg)) { + // compute ngram. take the best seg. + seg.prob = 0; + VocabIndex *vi = new VocabIndex[ngram_length]; + vi[ngram_length] = Vocab_None; + for (int i = ngram_length-1;i < seg.size();i ++) { + for (int j = 0;j < ngram_length-1;j++) + vi[j] = seg[i-1-j].node.node->get_id(); + seg.prob += -ngram.wordProb(seg[i].node.node->get_id(),vi); + } + + cout << seg << " " << seg.prob << endl; + // merge seg to base seg. + + // 6.1. Recompute the score after each section processed. (pruning 2) + } + + } + + +} + +/** + Create the best segmentation for a sentence. + \param _sent specify the sentence + \param words store Words info + \param seps return the best segmentation + */ + +void WFST::segment_best(const Sentence &_sent, + const Words &words, + Segmentation &seps) +{ + int i,ii,n,nn; + + p_st = &_sent; + p_words = &words; + + // in test mode, generate all positions where misspelled can appear, + // then create a new Words for them, re get_sections, + // create_base_segmentation and call segment_all1 for each sections. + + // 1. Generate mispelled positions (pruning 0 - GA) + // 2. Compute the score, jump to 1. if score is too low (pruning 1) + // 3. Make a new Words based on the original Words + // 4. Call get_sections + // 5. Call create_base_segmentation + // 6. Call segment_all1 for each sections. + // 6.1. Recompute the score after each section processed. (pruning 2) + + // 1. Bai toan hoan vi, tinh chap C(k,n) with modification. C(1,n)+C(2,n)+...+C(k,n) + Generator gen; + + gen.init(_sent); + vector pos; + int len; + while (gen.step(pos,len)) { + cerr << "POS :"; + for (int i = 0;i < len;i ++) cerr << pos[i]; + cerr << endl; + generate_misspelled_words(pos,len); + } + gen.done(); + +} + +// WFST (Weighted Finite State Transducer) implementation +// TUNE: think about genetic/greedy. Vietnamese is almost one-syllable words.. +// we find where is likely a start of a word, then try to construct word +// and check if others are still valid words. + +// the last item is always the incompleted item. We will try to complete +// a word from the item. If we reach the end of sentence, we will remove it +// from segs + +// obsolete +void WFST::segment_all(const Sentence &sent,vector &result) +{ + Words words; + words.construct(sent); + // segment_all1(sent,words,0,sent.get_syllable_count(),result);a + /* + int nn = words.size(); + for (i = 0;i < nn;i ++) { + int nnn = words[i].size(); + cerr << "From " << i << endl; + for (int ii = 0;ii < nnn;ii ++) { + int nnnn = words[i][ii].fuzzy_match.size(); + cerr << "Len " << ii << endl; + for (int iii = 0;iii < nnnn;iii ++) { + cerr << words[i][ii].fuzzy_match[iii].distance << " "; + cerr << words[i][ii].fuzzy_match[iii].node->get_prob() << endl; + } + } + } + */ + +} + +/** + * Segmentation comparator + */ + +class SegmentationComparator +{ +public: + bool operator() (const Segmentation &seg1,const Segmentation &seg2) { + return seg1.prob > seg2.prob; + } +}; + + + + + + +void Segmentor::init(const Sentence &sent, + Words &words, + int from, + int to) +{ + nr_syllables = to+1; + + segs.clear(); + segs.reserve(100); + + Trace trace; + trace.next_syllable = from; + trace.s.we = words.we; + segs.push_back(trace); // empty seg + + _sent = &(Sentence&)sent; + _words = &words; +} + +bool Segmentor::step(Segmentation &result) +{ + Sentence &sent = *_sent; + Words &words = *_words; + while (!segs.empty()) { + // get one + Trace trace = segs.back(); + segs.pop_back(); + + Segmentation seg = trace.s; + int next_syllable = trace.next_syllable; + + // segmentation completed. throw it away + if (next_syllable >= nr_syllables) + continue; + + //WordEntries::iterator i_entry; + WordEntryRefs &wes = words.get_we(next_syllable); + int ii,nn = wes.size(); + for (ii = 0;ii < nn;ii ++) { + WordEntryRef i_entry = wes[ii]; + + // New segmentation for longer incomplete word + Trace newtrace; + newtrace = trace; + newtrace.s.push_back(i_entry->id); + newtrace.s.prob += i_entry->node.node->get_prob(); + + /*-it's better to compute ngram outside this function + if (ngram_enabled) { + if (newtrace.s.size() >= ngram_length) { + VocabIndex *vi = new VocabIndex[ngram_length]; + vi[0] = newtrace.s.items[len-1].node(sent).node->get_id(); + vi[1] = Vocab_None; + newtrace.s.prob += -ngram.wordProb(newtrace.s.items[len-2].node(sent).node->get_id(),vi); + delete[] vi; + } + } + */ + + newtrace.next_syllable += i_entry->len; + if (newtrace.next_syllable == nr_syllables) { + //cout << count << " " << newtrace.s << endl; + result = newtrace.s; + return true; + //result.push_back(newtrace.s); + //push_heap(result.begin(),result.end(),SegmentationComparator()); + //count ++; + } else { + segs.push_back(newtrace); + } + } + } // end while + return false; +} + +void Segmentor::done() +{ +} diff --git a/libvspell/wfst.h b/libvspell/wfst.h index d37693d..39680d9 100644 --- a/libvspell/wfst.h +++ b/libvspell/wfst.h @@ -19,30 +19,48 @@ #define SEGM_SEPARATOR 1 -struct WordInfo { - WordNode::DistanceNode exact_match; - std::vector fuzzy_match; +struct Section { + int segment; + int start; + int len; }; -typedef std::vector WordInfos; +class Sections: public std::vector
{ +public: + SentenceRef st; + void construct(const Words &words); +}; + +std::ostream& operator << (std::ostream &os,const Sections &s); + +/** + Segmentor takes a Sentence, a Words and a range, then try to generate + all possible Segmentation. + */ + +class Segmentor +{ +private: + struct Trace + { + Segmentation s; + int next_syllable; + Trace():next_syllable(0) {} + }; + int nr_syllables; + std::vector segs; + SentenceRef _sent; + Words *_words; + int from,to; -class Words:public std::vector { public: - int get_word_count() const { return size(); } - int get_len(int i) const { return (*this)[i]->size(); } - int get_fuzzy_count(int i,int l) const { - return (*(*this)[i])[l]->fuzzy_match.size(); - } - WordNode::DistanceNode& get_fuzzy(int i,int l,int f) { - return (*(*this)[i])[l]->fuzzy_match[f]; - } - const WordNode::DistanceNode& get_fuzzy(int i,int l,int f) const{ - return (*(*this)[i])[l]->fuzzy_match[f]; - } - ~Words(); // WARN: destroy all. - void print() const; + void init(const Sentence &sent, + Words &words, + int from, + int to); + bool step(Segmentation &seg); + void done(); }; -// Words[from][len].fuzzy_match[i] class WFST { @@ -60,19 +78,26 @@ public: void enable_ngram(bool enable = true) { ngram_enabled = enable; } - void get_all_words(const Sentence &sent, - Words &words); void segment_best(const Sentence &sent, - const Words &words, - Segmentation &seps); + const Words &words, + Segmentation &seps); void segment_all(const Sentence &sent, std::vector &result); -private: + + //private: +public: // for testing purpose + void generate_misspelled_words(const std::vector &pos,int len); void segment_all1(const Sentence &sent, - const Words &words, - int from,int to, - std::vector &result); - + Words &words, + int from, + int to, + //const Segmentation &base_seg, + //int seg_id, + std::vector &result); + + // variables needed when run wfst + const Sentence *p_st; + const Words *p_words; }; #endif diff --git a/libvspell/wordnode.h b/libvspell/wordnode.h index 3b0906c..e48f555 100644 --- a/libvspell/wordnode.h +++ b/libvspell/wordnode.h @@ -31,6 +31,7 @@ protected: typedef SArrayIter node_map_iterator; node_map *nodes; WordInfo *info; + VocabIndex id; public: struct DistanceNode { @@ -43,16 +44,18 @@ public: int operator < (const DistanceNode &dn1) const { return (int)node+distance < (int)dn1.node+dn1.distance; } + WordNode& operator* () const { return *node; } + WordNode* operator-> () const { return node; } }; void recalculate(); public: - WordNode(strid _syllable):info(NULL),nodes(new node_map) {} + WordNode(strid _syllable):info(NULL),nodes(new node_map),id(_syllable) {} // ~WordNode(); - // strpair get_syllable() const { return syllable; } + VocabIndex get_syllable() const { return id; } virtual WordNodePtr get_next(strid str) const; void inc_a() { ASSERT(info != NULL); info->a++; } void inc_b() { ASSERT(info != NULL); info->b++; } @@ -65,10 +68,12 @@ public: float get_prob() const { return info ? info->prob : -1; } void set_prob(float _prob); - int get_syllable_count(); - void get_syllables(std::vector &syllables); + int get_syllable_count() const; + void get_syllables(std::vector &syllables) const; WordNodePtr follow_syllables(const std::vector &syllables); + void dump_next(std::ostream &os) const; + bool load(const char* filename); bool save(const char* filename); }; @@ -92,5 +97,7 @@ public: }; WordNodePtr get_root(); +std::ostream& operator << (std::ostream &os,const WordNode::DistanceNode &dnode); +std::ostream& operator << (std::ostream &os,const WordNode &node); #endif diff --git a/libvspell/words.cpp b/libvspell/words.cpp new file mode 100644 index 0000000..025b730 --- /dev/null +++ b/libvspell/words.cpp @@ -0,0 +1,244 @@ +#include "config.h" // -*- tab-width: 2 -*- +#include "spell.h" +#include +#include +#include +#include +#include +#include +#include + + +using namespace std; + + +/** + Store information used by WFST::get_all_words() +*/ +struct WordState { + + /** + the currently processing node + */ + WordNode::DistanceNode dnode; + + int fuzid; + + WordState(const WordNode::DistanceNode &n):dnode(n),fuzid(0) {} +}; + +/** + Find all possible words + \param sent specify the input sentence + \param w must be cleared before calling this function +*/ +void Words::construct(const Sentence &sent) +{ + Words &w = *this; + int i,n,ii,nn,k,nnn,iii; + vector syll; + set we; + + w.st = &(Sentence&)sent; + + vector states; + vector old_states(states); + states.reserve(10); + old_states.reserve(10); + + n = sent.get_syllable_count(); + + syll.resize(n); + for (i = 0;i < n;i ++) { + syll[i] = sent[i].get_cid(); + //cerr << syll[i] << " "; + } + //cerr << endl; + + for (i = 0;i < n;i ++) { // pos + + // finding all possible words started at the i-th syllable. + + // init states with root node + states.clear(); + states.push_back(WordNode::DistanceNode(get_root())); + + for (k = 0;k+i < n && !states.empty(); k ++) { // length of k + // save states + old_states = states; + + /* + cerr << "State:" << i << " " << k << endl; + int jj,jnn = states.size(); + for (jj = 0;jj < jnn;jj ++) { + cerr << sarch[states[jj].dnode.node->get_syllable()] << endl; + if (states[jj].dnode.node != wl) { + states[jj].dnode.node->dump_next(cerr); + cerr << endl; + } + } + */ + + // clear states, ready for new states + states.clear(); + + // get next new states -> states + // those in old_states which has reached end will be removed from states + nn = old_states.size(); + bool has_words = false; + for (ii = 0;ii < nn;ii ++) { + WordNode::DistanceNode &state = old_states[ii].dnode; + WordNodePtr exact_node = state.node->get_next(syll[i+k]); + vector nodes; + //nodes.clear(); + //if (exact_node) + //nodes.push_back(exact_node); + state.node->fuzzy_get_next(syll[i+k],nodes); + if (exact_node && + find(nodes.begin(),nodes.end(),exact_node) == nodes.end()) + nodes.push_back(exact_node); + nnn = nodes.size(); + for (iii = 0;iii < nnn;iii ++) { + states.push_back(old_states[ii]); + states.back().dnode = nodes[iii]; + if (nodes[iii].node != exact_node) + states.back().fuzid |= 1 << k; + + // get completed words + if (states.back().dnode.node->get_prob() >= 0) { + WordEntry e; + e.pos = i; + e.len = k+1; + e.fuzid = states.back().fuzid; + e.node = states.back().dnode; + we.insert(e); + has_words = true; + //winfo.fuzzy_match.push_back(states[ii]); + } + } + } + + + // this is an unknown syllable, we presume this is an unknown word + if (k == 0 && !has_words) { + WordEntry e; + e.pos = i; + e.len = 1; + e.fuzid = 1; + e.node = get_root()->get_next(unk_id); + we.insert(e); + } + } // end of k + } // end of i + + //copy(we.begin(),we.end(),ostream_iterator(cerr)); + // copy to real _we + n = we.size(); + w.we = new WordEntries; + w.we->reserve(n); + copy(we.begin(),we.end(),back_insert_iterator >(*w.we)); + for (i = 0;i < n;i ++) + (*w.we)[i].id = i; + + // build Words structure + w.construct(); +} + +/** + Post-process after initialize core member of Words + */ + +void Words::construct() +{ + int i_we,n_we = we->size(); + + for (i_we = 0;i_we < n_we;i_we ++) + add((*we)[i_we]); +} +/** + Dump a Words + */ +ostream& operator << (ostream &os, const Words &w) +{ + int i, nn = w.get_word_count(); + for (i = 0;i < nn;i ++) { + int nnn = w.get_len(i); + for (int ii = 0;ii < nnn;ii ++) { + int nnnn = w.get_fuzzy_count(i,ii); + if (w.get_we_exact(i,ii)) + os << *w.get_we_exact(i,ii) << endl; + for (int iii = 0;iii < nnnn;iii ++) + os << w.get_we_fuzzy(i,ii,iii) << endl; + } + } + return os; +} + +Words::~Words() +{ + int pos,nr_pos = get_word_count(); + for (pos = 0;pos < nr_pos;pos ++) { + int len,nr_len = get_len(pos); + for (len = 0;len < nr_len;len ++) + delete (*(*this)[pos])[len]; + delete (*this)[pos]; + } +} + +std::ostream& operator << (std::ostream &os,const WordEntry &we) +{ + using namespace boost; + os << format("%d %d %x ") % we.pos % we.len % we.fuzid << we.node; + return os; +} + + +/** + Construct a Words based on another Words. + Keep only exact matches. + */ + +void Words::based_on(const Words &w) +{ + Words &me = *this; + we = w.we; + me.st = w.st; + + int i_pos,n_pos = w.get_word_count(); + + for (i_pos = 0;i_pos < n_pos;i_pos ++) { + int i_len,n_len = w.get_len(i_pos); + for (i_len = 0;i_len < n_len;i_len ++) + if ((*w[i_pos])[i_len]->exact_match) + add(*(*w[i_pos])[i_len]->exact_match); + } +} + +void Words::add(WordEntry &w) +{ + Words &me = *this; + while (me.size() <= w.pos) + me.push_back(new WordInfos); + + WordInfos &wis = *me[w.pos]; + + while (wis.size() <= w.len) { + wis.push_back(new WordInfo); + wis.back()->exact_match = NULL; + } + + WordInfo &wi = *wis[w.len]; + if (!w.fuzid) // exact match + wi.exact_match = &w; + else { + wi.fuzzy_match.push_back(&w); + for (int j = 0;j < w.len;j ++) + if (w.fuzid & (1 << j)) { + while (me.size() <= j+w.pos) + me.push_back(new WordInfos); + me[j+w.pos]->fuzzy_map.push_back(&w); + } + } + + wis.we.push_back(&w); +} -- 2.11.4.GIT