From 5435c457b268ccb535d3538b4a4f58f80821f077 Mon Sep 17 00:00:00 2001 From: Miriam Ruiz Date: Mon, 21 Nov 2011 03:08:54 +0100 Subject: [PATCH] Updated source code from upstream SVN svn co http://svn-rdlab.lsi.upc.edu/subversion/svmtool++/public (user: reader password: reader) See: https://groups.google.com/group/svmt/browse_thread/thread/10ec7861aa024c87?hl=es --- Makefile | 15 +- include/api.h | 78 + include/common.h | 62 +- include/dict.h | 93 +- include/er.h | 24 +- include/hash.h | 228 ++- include/learner.h | 163 +- include/list.h | 278 +++- include/mapping.h | 56 +- include/marks.h | 90 +- include/{stack.h => nodo.h} | 53 +- include/reader.h | 66 + include/stack.h | 93 +- include/strategies.h | 16 +- include/swindow.h | 254 ++-- include/tagger.h | 179 ++- include/weight.h | 54 +- src/CMakeLists.txt | 69 + src/api.cc | 273 ++++ src/bin/SVMTagger.cc | 140 +- src/bin/SVMTeval.cc | 279 ++-- src/bin/SVMTlearn.cc | 59 +- src/common.cc | 881 +++++------ src/dict.cc | 1428 +++++++++-------- src/er.cc | 53 +- src/hash.cc | 321 ---- src/learner.cc | 3532 +++++++++++++++++++++---------------------- src/list.cc | 249 --- src/mapping.cc | 645 ++++---- src/reader.cc | 129 ++ src/stack.cc | 24 +- src/swindow.cc | 2269 ++++++++++++--------------- src/tagger.cc | 1719 ++++++++++----------- src/weight.cc | 816 +++++----- 34 files changed, 7317 insertions(+), 7371 deletions(-) create mode 100755 include/api.h mode change 100644 => 100755 include/common.h mode change 100644 => 100755 include/dict.h mode change 100644 => 100755 include/er.h mode change 100644 => 100755 include/hash.h rewrite include/learner.h (71%) mode change 100644 => 100755 mode change 100644 => 100755 include/list.h mode change 100644 => 100755 include/mapping.h rewrite include/marks.h (63%) mode change 100644 => 100755 copy include/{stack.h => nodo.h} (51%) create mode 100644 include/reader.h mode change 100644 => 100755 include/stack.h mode change 100644 => 100755 include/strategies.h rewrite 
include/swindow.h (73%) mode change 100644 => 100755 rewrite include/tagger.h (67%) mode change 100644 => 100755 mode change 100644 => 100755 include/weight.h create mode 100644 src/CMakeLists.txt create mode 100755 src/api.cc mode change 100644 => 100755 src/bin/SVMTagger.cc mode change 100644 => 100755 src/bin/SVMTeval.cc mode change 100644 => 100755 src/bin/SVMTlearn.cc rewrite src/common.cc (71%) mode change 100644 => 100755 rewrite src/dict.cc (81%) mode change 100644 => 100755 mode change 100644 => 100755 src/er.cc delete mode 100644 src/hash.cc rewrite src/learner.cc (84%) mode change 100644 => 100755 delete mode 100644 src/list.cc rewrite src/mapping.cc (63%) mode change 100644 => 100755 create mode 100644 src/reader.cc mode change 100644 => 100755 src/stack.cc rewrite src/swindow.cc (77%) mode change 100644 => 100755 rewrite src/tagger.cc (80%) mode change 100644 => 100755 rewrite src/weight.cc (71%) mode change 100644 => 100755 diff --git a/Makefile b/Makefile index 199e8fa..b5f1473 100644 --- a/Makefile +++ b/Makefile @@ -15,25 +15,14 @@ SVMTagger: src/bin/SVMTagger.static.o $(LIBRARY).a $(LIBRARY).so MAJOR=0 MINOR=0 -SOURCES = src/learner.cc \ - src/tagger.cc \ - src/swindow.cc \ - src/mapping.cc \ - src/dict.cc \ - src/list.cc \ - src/weight.cc \ - src/hash.cc \ - src/stack.cc \ - src/er.cc \ - src/common.cc - +SOURCES = $(shell find src -name "*.cc" -maxdepth 1) INCLUDE_DIR = include SHARED_OBJS = $(SOURCES:.cc=.shared.o) STATIC_OBJS = $(SOURCES:.cc=.static.o) EXTRA_CFLAGS=-I$(INCLUDE_DIR) -STATIC_CFLAGS= -O2 -g -ansi -Wall -Wno-unused-parameter $(EXTRA_CFLAGS) +STATIC_CFLAGS= -O2 -g -std=c++0x -Wall -Wno-unused-parameter $(EXTRA_CFLAGS) SHARED_CFLAGS= $(STATIC_CFLAGS) -fPIC LDFLAGS= -Wl,-z,defs -Wl,--as-needed -Wl,--no-undefined diff --git a/include/api.h b/include/api.h new file mode 100755 index 0000000..adb782b --- /dev/null +++ b/include/api.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This 
library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef API_H + + +class nodoResult +{ + private: + int index; + char *wrd; + char *pos; + char *scores; + public: + void pushIndex(int); + void pushPOS(char *); + void pushWord(char *); + void pushScores(char *); + int getIndex(); + char *getWord(); + char *getPOS(); + char *getScores(); + + nodoResult(); + ~nodoResult(); +}; + + +class apiResult +{ + private: + nodoResult *array; + int numItems; + public: + apiResult(int); + ~apiResult(); + char *getPOS(int); + char *getWord(int); + char *getScores(int); + int pushWord(char *, int); + int pushPOS(char *, int); + int pushScores(char *, int); + void print(); +}; + + +int apiInsertSentence(const char *szSentence); + +apiResult *apiTaggerRun(const char *szSentence, int iNumWords); + +int apiTaggerCreate( char *szModelName ); + +int apiTaggerInitialize ( int iStrategy, + const char *szSense, + int iWinLength, + int iWinIndex, + float fWFKnown, + float fWFUnk); + +void apiTaggerDestroy(); + +#define API_H +#endif diff --git a/include/common.h b/include/common.h old mode 100644 new mode 100755 index 66cb205..e8ca31f --- a/include/common.h +++ b/include/common.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software 
Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @@ -16,20 +16,25 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_COMMON_H +#ifndef COMMON_H #include #include #include "list.h" +#include +#include +#include + +using namespace std; #define CLOCKS_PER_SECOND sysconf(_SC_CLK_TCK) -#define TRUE 1 -#define FALSE 0 -#define KNOWN 11 -#define UNKNOWN 22 -#define LEFT_TO_RIGHT 1 -#define RIGHT_TO_LEFT 2 -#define LR_AND_RL 3 +#define TRUE 1 +#define FALSE 0 +#define KNOWN 11 +#define UNKNOWN 22 +#define LEFT_TO_RIGHT 1 +#define RIGHT_TO_LEFT 2 +#define LR_AND_RL 3 #define TAM_MARK 10 #define TAM_POS 10 @@ -41,32 +46,41 @@ struct nodo_feature_list { - char mark[TAM_MARK]; - int n; - char *feature; - simpleList l; + nodo_feature_list() : + mark(), + l() + { + } + /* the type of the feature. see marks.h for different types */ + std::string mark; + /* words concerned, eg. 
(-2, -1, 0) */ + simpleList l; }; -FILE *openFile(const char *name, const char mode[]); -void generateFileName(const char *name, const char *added, int numModel, int direction, int what, const char * type, char *out); -void showProcessDone(int num,int freq, int isEnd, const char *what); +FILE *openFile(const std::string& name, const char mode[]); +void generateFileName(const std::string& name, const std::string& added, int numModel, int direction, int what, const std::string& type, string& out); + +void showProcessDone(int num,int freq, int isEnd, const std::string& what); void showProcess(int num, int isEnd); -void showTime(const char *what, double real, double utime,double stime); +//void showTime(char *what, clock_t start,clock_t end, struct tms tbuff1,struct tms tbuff2); +void showTime(const std::string& what, double real, double utime,double stime); int goToWord(FILE *f, int offset); -int readString(FILE *f, char *out); -int readTo(FILE *f, char endChar,char endLine,char *out); +int readString(FILE* f, string& out); +int readTo(FILE* f, char endChar, char endLine, string& out); //void saltarLinea(FILE *f); void qsort(int a[], int lo, int hi); -int ordenarStringPorParejas(char *szIn, char *szOut, int depth, char *szInicial); +int ordenarStringPorParejas(const char* szIn, char* szOut, int depth, char* szInicial); + +void destroyFeatureList(simpleList *); +void createFeatureList(const std::string& name,simpleList *featureList); +void removeFiles(const std::string& path, int type,int numModel, int direction, int verbose); +void Tokenize(const string& str, vector& tokens, const string& delimiters); -void destroyFeatureList(simpleList *, int); -void createFeatureList(char *, simpleList *); -void removeFiles(char *, int ,int , int, int); -#define SVMT_COMMON_H +#define COMMON_H #endif diff --git a/include/dict.h b/include/dict.h old mode 100644 new mode 100755 index 8d0c789..21ae1e1 --- a/include/dict.h +++ b/include/dict.h @@ -5,7 +5,7 @@ * modify it under the terms 
of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @@ -16,64 +16,67 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include +#ifndef DICT_H -#ifndef SVMT_DICT_H +#include "list.h" +#include "hash.h" +#include +#include #define TAMTXT 100 -struct dataDict +struct infoDict { - char wrd[TAMTXT]; - int numWrd; - int numMaybe; - simpleList maybe; + std::string pos; + int num; }; -struct infoDict +struct dataDict { - char txt[TAMTXT]; - int num; + std::string wrd; + int numWrd; + int numMaybe; + simpleList maybe; }; class dictionary { - private: - hash_t d; - //FILE *in; - - // FILE *openFile(char *name, char mode[]); - void dictLoad(FILE *in); - void dictCreate(FILE *f,int offset, int limit); - void dictIncInfo(dataDict *elem, char *pos); - - int readInt(FILE *in); - infoDict *readData(FILE *in); - - public: - void dictAddBackup(char *name); - int getElement(char *key); - char *getElementWord(uintptr_t ptr); - int getElementNumWord(uintptr_t ptr); - int getElementNumMaybe(uintptr_t ptr); - simpleList *getElementMaybe(uintptr_t ptr); - char *getMFT(int w); - char *getAmbiguityClass(int w); - - hash_t *dictFindAmbP(int *numPOS); - hash_t *dictFindUnkP(int *numPOS); - void dictRepairFromFile(char *fileName); - void dictRepairHeuristic(float dratio); - - void dictCleanListInfoDict(simpleList * l, int num); + public: + dictionary(const std::string& name, const std::string& backup); + dictionary(const std::string& name); + dictionary(const std::string& name,int limInf, int limSup); + ~dictionary(); + + void dictAddBackup(const std::string& name); + void addBackupEntry(const std::string& token, const std::set& tags); + + 
dataDict* getElement(const std::string& key); + std::string& getElementWord(dataDict* ptr); + int getElementNumWord(dataDict* ptr); + int getElementNumMaybe(dataDict* ptr); + simpleList& getElementMaybe(dataDict* ptr); + infoDict* getMFT(dataDict* w); + std::string getAmbiguityClass(dataDict* w); + + hash_t* dictFindAmbP(int *numPOS); + hash_t* dictFindUnkP(int *numPOS); + void dictRepairFromFile(const std::string& fileName); + void dictRepairHeuristic(float dratio); + + void dictCleanListInfoDict(simpleList< infoDict* >* l, int num); + + void dictWrite(const std::string& outName); - dictionary(char *name, char *backup); - dictionary(char *name); - dictionary(char *name,int limInf, int limSup); - ~dictionary(); +private: + void dictLoad(FILE *in); + void dictCreate(FILE *f,int offset, int limit); + void dictIncInfo(dataDict *elem, const std::string& pos); + + int readInt(FILE *in); + infoDict *readData(FILE *in); - void dictWrite(char *outName); + hash_t d; }; -#define SVMT_DICT_H +#define DICT_H #endif diff --git a/include/er.h b/include/er.h old mode 100644 new mode 100755 index b29ca31..783f47d --- a/include/er.h +++ b/include/er.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -16,28 +16,30 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_ER_H +#ifndef ER_H #include #include +#include /***************************************************************** - * Regular expression - *****************************************************************/ +* Regular expression +*****************************************************************/ -#define CARD 100 -#define CARDPUNCT 101 -#define CARDSEPS 102 -#define CARDSUFFIX 103 +#define CARD 100 +#define CARDPUNCT 101 +#define CARDSEPS 102 +#define CARDSUFFIX 103 extern regex_t erCard,erCardPunct,erCardSeps,erCardSuffix; extern regex_t erMultiWord,erContainNum,erStartCap,erStartLower,erStartNumber, erAllUp,erAllLow,erContainCap,erContainCaps,erContainPeriod,erContainComma; + void erCompRegExp(); void erFreeRegExp(); -int erLookRegExp2(void *er, char * str); -int erLookRegExp(char *m); +int erLookRegExp2(void* er, const std::string& str); +int erLookRegExp(const std::string& m); -#define SVMT_ER_H +#define ER_H #endif diff --git a/include/hash.h b/include/hash.h old mode 100644 new mode 100755 index 85f26ae..fd949fc --- a/include/hash.h +++ b/include/hash.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -16,45 +16,211 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_HASH_H +#ifndef SVMTOOL_HASH_H +#define SVMTOOL_HASH_H -#include +#include +#include +#include +#include -#ifdef __cplusplus -extern "C" -{ +#ifndef NO_STDCPP0X +#include +#else +#include #endif -typedef struct hash_t -{ - struct hash_node_t **bucket; /* array of hash nodes */ - int size; /* size of the array */ - int entries; /* number of entries in table */ - int downshift; /* shift cound, used in hash function */ - int mask; /* used to select bits for hashing */ -} hash_t; - -typedef struct hash_node_t +#ifdef NO_STDCPP0X +namespace __gnu_cxx { - uintptr_t data; /* data in hash node */ - const char * key; /* key for hash lookup */ - struct hash_node_t *next; /* next node in hash chain */ -} hash_node_t; + template<> struct hash< std::string > + { + size_t operator()( const std::string& x ) const + { + return hash< const char* >()( x.c_str() ); + } + }; +} +#endif + +struct hash_node_t { + hash_node_t() : data(0), key(""), next(0) {} + ~hash_node_t() {} + long data; /* data in hash node */ + const char * key; /* key for hash lookup */ + struct hash_node_t *next; /* next node in hash chain */ +}; #define HASH_FAIL -1 -void hash_init(hash_t *, int); -uintptr_t hash_lookup (const hash_t *, const char *); -uintptr_t hash_insert (hash_t *, const char *, uintptr_t); -uintptr_t hash_delete (hash_t *, const char *); -void hash_destroy(hash_t *); -char *hash_stats (hash_t *); -void hash_print(hash_t *,FILE *f); -void rebuild_table(hash_t *); +#ifndef NO_STDCPP0X +#define HASH_TYPE std::unordered_map< std::string, T > +#else +#define HASH_TYPE __gnu_cxx::hash_map< std::string, T > +#endif -#ifdef __cplusplus -} +template +struct hash_t : public HASH_TYPE +{ +// void hash_init(int); +// +// long hash_lookup (const hash_t* tptr, const std::string& key) const; +// +// long hash_insert (const std::string& key, T data); +// +// long hash_delete (const 
std::string&); +// +// void hash_destroy(); +// +// void hash_print(FILE *f); + + #define HASH_LIMIT 0.5 + + /**************************************************/ + + // /* + // * hash() - Hash function returns a hash number for a given key. + // * + // * tptr: Pointer to a hash table + // * key: The key to create a hash number for + // */ + // static long hash(const hash_t *tptr, const char *key) + // { + // int i=0; + // long hashvalue; + // + // if (key) while (*key != '\0') i=(i<<3)+(*key++ - '0'); + // + // hashvalue = (((i*1103515249)>>downshift) & mask); + // if (hashvalue < 0) hashvalue = 0; + // + // return hashvalue; + // } + + /**************************************************/ + + /**************************************************/ + + /* + * hash_init() - Initialize a new hash table. + * + * tptr: Pointer to the hash table to initialize + * buckets: The number of initial buckets to create + */ + void hash_init(int buckets) + { + HASH_TYPE::clear(); +#ifdef NO_STDCPP0X + HASH_TYPE::resize(buckets); +#else + (void)buckets; #endif + } + + /**************************************************/ + + /* + * hash_lookup() - Lookup an entry in the hash table and return a pointer to + * it or HASH_FAIL if it wasn't found. + * + * tptr: Pointer to the hash table + * key: The key to lookup + */ + T hash_lookup(const std::string& key) const + { + typename HASH_TYPE::const_iterator it = HASH_TYPE::find(key); + if (it == HASH_TYPE::end()) + return (T)HASH_FAIL; + else + return (*it).second; + } + + /**************************************************/ + + /* + * hash_insert() - Insert an entry into the hash table. If the entry already + * exists return a pointer to it, otherwise return HASH_FAIL. 
+ * + * tptr: A pointer to the hash table + * key: The key to insert into the hash table + * data: A pointer to the data to insert into the hash table + */ + long hash_insert(const std::string& key, T data) + { +// std::cerr << "hash_insert in " << this << ": " << key << " -> " << data << std::endl; + typename HASH_TYPE::iterator it = HASH_TYPE::find(key); + if (it != HASH_TYPE::end()) + return (long)((*it).second); + + HASH_TYPE::insert(std::make_pair(key, data)); + return HASH_FAIL; + } + + /**************************************************/ + + /* + * hash_delete() - Remove an entry from a hash table and return a pointer + * to its data or HASH_FAIL if it wasn't found. + * + * tptr: A pointer to the hash table + * key: The key to remove from the hash table + */ + T hash_delete(const std::string& key) + { + typename HASH_TYPE::iterator it = HASH_TYPE::find(key); + if (it == HASH_TYPE::end()) + return (T)HASH_FAIL; + T data = (*it).second; + HASH_TYPE::erase(it); + return data; + } + + /**************************************************/ + + /* + * hash_destroy() - Delete the entire table, and all remaining entries. + * we can't use a C++ destructor since HASH_TYPE has no virtual destructor. + */ + void hash_destroy() + { + delete_entries(*this); + HASH_TYPE::clear(); + } + + /**************************************************/ + + /* + * hash_print() - Print Keys in FILE *f + * + */ + void hash_print(FILE *f) + { + + for (typename HASH_TYPE::iterator it = HASH_TYPE::begin(); it != HASH_TYPE::end(); it++) + { + fprintf(f,"%s\n",(*it).first.c_str()); + } + } + + +}; + +/* + * delete_entries() - call the dtor on every entry in the hash. Useful is + * those entries where allocated before getting into the hash. + * + * h : the hash containing pointers. 
+ */ +template +void delete_entries(hash_t& h) { + for (typename hash_t::iterator it = h.begin(); it != h.end(); it++) { + delete it->second; + } +} + +/* + * delete_entries() - same as before but don't delete if it's not a pointer + */ +template void delete_entries(hash_t&) { } -#define SVMT_HASH_H #endif diff --git a/include/learner.h b/include/learner.h old mode 100644 new mode 100755 dissimilarity index 71% index 6a70a03..4e055ae --- a/include/learner.h +++ b/include/learner.h @@ -1,77 +1,86 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef SVMT_LEARNER_H - -struct samples_counter_t -{ - char *key; - int positive; - int negative; -}; - -class learner -{ -private: - int learnerNumAMBP,learnerNumUNKP; - hash_t *learnerAMBP_H,*learnerUNKP_H; - swindow *sw; - simpleList learnerFeatureList,*learnerAMBP_L,*learnerUNKP_L; - FILE *fKnown,*fUnknown; - int learnerNumFeatures; - char obtainAtrChar(FILE *channel); - int obtainAtrInt(FILE *channel,int *endAtr); - void learnerCreateFeatureList(char *name, simpleList *featureList); - simpleList *learnerTransformHashInList(hash_t *tptr); - void learnerCreateDefaultFile(const char *modelName, const char *str); - void learnerCreatePOSFile(char *modelName, int is_ambp, hash_t *h); - void learnerCount(char *name, int *sentences, int *words); - int learnerExecSVMlight(char *svmdir, char *options, char *posFile, char *outFile); - int learnerLeftToRight(simpleList *, simpleList *, dictionary *dKnown, dictionary *dUnknown,/*mapping *mKnown, mapping *mUnknown,*/int numWrds, int inicio); - int learnerRightToLeft(simpleList *,simpleList *, dictionary *dKnown, dictionary *dUnknown,/*mapping *mKnown, mapping *mUnknown,*/int numWrds, int inicio); - void learnerGetFeatures(nodo *elem, stack_t *stk,dictionary *d, simpleList *featureList, int direction); - void learnerGenerateFeatures(nodo *elem,simpleList *featureList, dictionary *d, int direction); - void learnerGenerateFeaturesUnk(nodo *elem,simpleList *featureList, dictionary *d, dictionary *dUnk, int direction); - void learnerTrainModel(char *trainingFileName,dictionary *d, int numModel,int direction, int numSent, int numWords, int numChunks); - weightRepository *learnerBuiltWeightRepository(weightRepository *wr,mapping *m,char *pos,char *fileName); - hash_t *learnerBuiltBias(hash_t *,char 
*pos,char *fileName); - void learnerDestroyBias(hash_t *h); - int learnerIsPossiblePOS(char *wrd, char *pos, int Known_or_Unknown); - simpleList *learnerGetPotser(char *wrd, int Known_or_Unknown, dictionary *d); - void learnerTraining(FILE *f,char *modelName, int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList *lpos); - void learnerPrintMessage(int numModel, int K_or_U, int LR_or_RL,int is_fex); - int learnerNumChunks(char *trainingFileName,float percentage,int nSentences); - int learnerIsInsideList(simpleList *l, char *key); - void learnerDoLearn(FILE *f,int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList *lPosToTrain); - void learnerDressNakedSetTrain(dictionary *d,mapping *m,FILE *f, char* pos, int numModel, int direction, int K_or_U,int *nPos,int *nNeg); - void learnerPushSample(char *wrd,int numModel,int direction, int Known_or_Unknown, char *pos,char *samplePos,char *features,dictionary *d, int *nNeg, int *nPos); - void learnerPushSampleUnk(char *wrd,int numModel,int direction, int Known_or_Unknown,char *pos, char *samplePos, char *features,dictionary *d, int *nNeg, int *nPos); - char *learnerCreateFeatureString(FILE *f,mapping *m); - - char *read_feature_list_from_config_file(FILE *f, char *first_feature); - void read_config_file(const char *config_file); - -public: - learner(char *modelName); - learner(); - ~learner(); - void learnerRun(char *train); -}; - -#define SVMT_LEARNER_H -#endif +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#ifndef LEARNER_H + +#include "hash.h" +#include "dict.h" +#include + +struct samples_counter_t +{ + char *key; + int positive; + int negative; +}; + +class learner +{ +private: + int learnerNumAMBP,learnerNumUNKP; + hash_t* learnerAMBP_H; + hash_t* learnerUNKP_H; + swindow *sw; + simpleList learnerFeatureList; + simpleList* learnerAMBP_L; + simpleList* learnerUNKP_L; + FILE *fKnown,*fUnknown; + int learnerNumFeatures; + char obtainAtrChar(FILE *channel); + int obtainAtrInt(FILE *channel,int *endAtr); + // void learnerCreateFeatureList(char *name, simpleList *featureList); + simpleList* learnerTransformHashInList(hash_t *tptr); + void learnerCreateDefaultFile(const std::string& modelName, const std::string& str); + void learnerCreatePOSFile(const std::string& modelName, int is_ambp, hash_t< infoDict* >* h); + void learnerCount(const std::string& name, int* nWords, int* nSentences); + int learnerExecSVMlight(const std::string& svmdir, const std::string& options, const std::string& posFile, const std::string& outFile); + int learnerLeftToRight(simpleList< nodo_feature_list* >* featureList, simpleList< nodo_feature_list* >* featureListUnk, dictionary* dKnown, dictionary* dUnknown, int numWrds, int inicio); + int learnerRightToLeft(simpleList< nodo_feature_list* >* featureList, simpleList< nodo_feature_list* >* featureListUnk, dictionary* dKnown, dictionary* dUnknown, int numWrds, int inicio); + void learnerGetFeatures(nodo* elem, std::stack< std::string, std::deque< std::string, std::allocator< std::string > > >& stk, dictionary* d, simpleList< nodo_feature_list* >* featureList, int direction); + void learnerGenerateFeatures(nodo* elem, simpleList< nodo_feature_list* >* 
featureList, dictionary* d, int direction); + void learnerGenerateFeaturesUnk(nodo* elem, simpleList< nodo_feature_list* >* featureList, dictionary* d, dictionary* dUnk, int direction); + void learnerTrainModel(const std::string& trainingFileName, dictionary* dKnown, int numModel, int direction, int numSent, int numWords, int numChunks); + weightRepository* learnerBuiltWeightRepository(weightRepository *wr,mapping *m,const std::string& pos,const std::string& fileName); + hash_t* learnerBuiltBias(hash_t* h, const std::string& pos, const std::string& fileName); + void learnerDestroyBias(hash_t *h); + int learnerIsPossiblePOS(const std::string& wrd, const std::string& pos, int Known_or_Unknown); + simpleList* learnerGetPotser(const std::string& wrd, int Known_or_Unknown, dictionary *d); +// void learnerTraining(FILE *f,char *modelName, int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList *lpos); + void learnerPrintMessage(int numModel, int K_or_U, int LR_or_RL,int is_fex); + int learnerNumChunks(const std::string& trainingFileName, float percentage, int nSentences); + bool learnerIsInsideList(simpleList< infoDict* >* l, const std::string& key); + void learnerDoLearn(FILE *f,int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList* lPosToTrain); + void learnerDressNakedSetTrain(dictionary *d,mapping *m,FILE *f, const std::string& pos, int numModel, int direction, int K_or_U,int *nPos,int *nNeg); + void learnerPushSample(const std::string& wrd,int numModel,int direction, int Known_or_Unknown, const std::string& pos,const std::string& samplePos,const std::string& features,dictionary *d, int *nNeg, int *nPos); + void learnerPushSampleUnk(const std::string& wrd,int numModel,int direction, int Known_or_Unknown,const std::string& pos, const std::string& samplePos, const std::string& features,dictionary *d, int *nNeg, int *nPos); + std::string learnerCreateFeatureString(FILE *f,mapping *m); + + std::string read_feature_list_from_config_file(FILE *f, char 
*first_feature); + std::string read_feature_list_from_string(const std::vector< std::string >& tokens); + void read_config_file(const std::string& config_file); + +public: + learner(char *modelName); + learner(); + ~learner(); + void learnerRun(const std::string& train); +}; + +#define LEARNER_H +#endif diff --git a/include/list.h b/include/list.h old mode 100644 new mode 100755 index 84b1b48..b097ab3 --- a/include/list.h +++ b/include/list.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @@ -16,43 +16,267 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_LIST_H +#ifndef SIMPLELIST_H + +template <class T> struct listNode { - int ord; - void *data; - listNode *next; - listNode *previous; + int ord; + T data; + listNode<T>* next; + listNode<T>* previous; }; +template <class T> class simpleList { private: - //List Control - listNode *first; - listNode *last; - listNode *index; - int numObj; + //List Control + listNode<T>* first; + listNode<T>* last; + listNode<T>* index; + int numObj; public: - ~simpleList(); - simpleList(); - void deleteList(); - int next(); - int previous(); - void setFirst(); - void *get(int position); - void *getIndex(); - void *getFirst(); - void *getLast(); - int show(); - int add(void *object); - int delIndex(); - int isEmpty(); - int numElements(); +// ~simpleList(); +// simpleList(); +// void deleteList(); +// int next(); +// int previous(); +// void setFirst(); +// void *get(int position); +// void *getIndex(); +// void *getFirst(); +// void *getLast(); +// int show(); +// int add(void *object); +// int delIndex(); +// int isEmpty(); +// int numElements(); + +
/****************************************************************************/ + /* Simple List */ + /****************************************************************************/ + void deleteList() + { + int cont = numObj; + + if (first==0) return; + listNode<T>* aux=first; + + while (first->next!=0 && cont >= 1) + { + aux = first; + first = first->next; + cont = cont - 1; + delete aux; + } + + delete last; + numObj = 0; + first = 0; + last = 0; + index = 0; + } + + /****************************************************************************/ + + ~simpleList() + { + deleteList(); + } + + /****************************************************************************/ + + simpleList() + { + numObj = 0; + first = 0; + last = 0; + index = 0; + } + + /****************************************************************************/ + + /*Move Interest Point to next element */ + bool next() + { + if ((index == 0) || (index->next == 0)) return false; + index = index->next; + return true; + } + + /****************************************************************************/ + + /* Move Interest Point to previous element */ + bool previous() + { + if ((index==0) || (index->previous==0)) return false; + index = index->previous; + return true; + } + + /****************************************************************************/ + + /* Get Interest Point */ + T* getIndex() + { + if ( index == 0 ) return 0; + else return &index->data; + } + + /****************************************************************************/ + + /* Get first element (null pointer if the list is empty) */ + T* getFirst() + { + return (first == 0) ? 0 : &first->data; + } + + /****************************************************************************/ + /* Get last element (null pointer if the list is empty) */ + T* getLast() + { + return (last == 0) ? 0 : &last->data; + } + + /****************************************************************************/ + + void setFirst() + { + index = first; + } + + /****************************************************************************/ + + T* get(int position) + { + listNode<T>* aux; + int
i; + + if (numObj == 0 || position < 0 || position >= numObj) + return 0; + + aux = first; + + for(i=0; i<position; i++) + { + if (aux->next != 0) aux = aux->next; + else return 0; + } + return &aux->data; // pointer to the stored element, like getIndex() + } + + /****************************************************************************/ + + /* Show list elements */ + int show() + { + if (first==0) return 0; + + listNode<T>* actual=first; + + while (actual->next!=0) + { + actual=actual->next; + + } + return 0; + } + + /****************************************************************************/ + + int add(T object) + { + listNode<T>* aux = new listNode<T>(); + + if(numObj == 0) + { + aux->previous=0; + first = aux; + last = aux; + index = aux; + } + else + { + aux->previous = last; + last->next = aux; + last = aux; + } + + aux->ord = numObj; + aux->data = object; + aux->next=0; + numObj++; + return numObj; + } + + /****************************************************************************/ + + int delIndex() + { + listNode<T>* aux = index; + + if(numObj == 0) return -1; + + if (index==last && index==first) + { + /* first = aux->next; + aux->previous = 0; + index = first; + last = aux->previous; + last->next = 0; + index = last;*/ + first = 0; + index = 0; + last = 0; + aux->previous = 0; + aux->next = 0; + } + else if (index==first) + { + + first = aux->next; + first->previous = 0; + index = first; + } + else if (index==last) + { + last = aux->previous; + last->next = 0; + index = last; + } + else + { + index = index->previous; + aux->previous->next = aux->next; + aux->next->previous = aux->previous; + } + + numObj--; + delete aux; + return numObj; + } + + /****************************************************************************/ + + bool isEmpty() + { + return (numObj == 0 || first == 0); + } + + /****************************************************************************/ + + int numElements() + { + return numObj; + } + + + }; -#define SVMT_LIST_H +#define SIMPLELIST_H #endif diff --git a/include/mapping.h b/include/mapping.h old mode 100644 new mode 100755 
index eb1c30d..ae7f4b9 --- a/include/mapping.h +++ b/include/mapping.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU @@ -16,27 +16,51 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_MAPPING_H +#ifndef MAPPING_H + +#include "hash.h" +/* +* Cada elemento insertado en el mapping es del tipo mapping_node_t. +* Este objeto está compuesto por: +* +* feature char * Atributo almacenado +* number char * Número identificativo de el atributo en formato alfanumérico +* num Int Número de veces que aparece +*/ +class mapping_node_t +{ + public: + char *feature; + char *number; + int num; + + ~mapping_node_t() + { + delete feature; + delete number; + } +}; + class mapping { private: - hash_t *mapByKey; - hash_t *mapByNumber; - int mapping_counter; + hash_t* mapByKey; + hash_t* mapByNumber; + int mapping_counter; public: - void mappingWrite(char *,int); - int mappingAddByKey(const char *key); - int mappingAddNumber(const char *key); - int mappingGetNumberByFeature(const char *key); - char *mappingGetFeatureByNumber(const char *key); - int mappingRepair(int maxFeat, int times); - int mappingNumElements(); - void mappingBuilt(FILE *f,int mac_mapping_size, int count_cut_off); + void mappingWrite(const char *,int); + int mappingAddByKey(const char *key); + int mappingAddNumber(const char *key); + int mappingGetNumberByFeature(const char *key); + char *mappingGetFeatureByNumber(const char *key); + int mappingRepair(int maxFeat, int times); + int mappingNumElements(); + void mappingBuilt(FILE *f,int mac_mapping_size, int count_cut_off); - mapping(); - ~mapping(); + mapping(); + 
~mapping(); }; -#define SVMT_MAPPING_H +#define MAPPING_H #endif diff --git a/include/marks.h b/include/marks.h old mode 100644 new mode 100755 dissimilarity index 63% index 14cd11b..06d7770 --- a/include/marks.h +++ b/include/marks.h @@ -1,45 +1,45 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef SVMT_MARKS_H - -#define SLASTW "Swn" //Last Word -#define WMARK "w" //Words -#define PMARK "p" //POS -#define KMARK "k" //Ambiguity Classes -#define MMARK "m" //Maybe -#define MFTMARK "f" //Most Frequent Tag --> f(-1) --> f-1:NN -#define PREFIX_MARK "a" //prefixes -#define SUFFIX_MARK "z" //Suffixes -#define CHAR_A_MARK "ca" //Character, counting from the beggining of the begining of the token (starting at 1) -#define CHAR_Z_MARK "cz" //Character, counting from the end of the begining of the token (starting at 1) -#define LENGTH_MARK "L" //token length -#define START_CAPITAL_MARK "SA" //start with upper case -#define START_LOWER_MARK "sa" //start with lower case -#define START_NUMBER_MARK "SN" //start with number -#define ALL_UPPER_MARK "AA" //all upper case -#define ALL_LOWER_MARK "aa" //all lower case -#define CONTAIN_CAP_MARK "CA" //contains a capital letter -#define CONTAIN_CAPS_MARK "CAA" 
//contains several capital letters -#define CONTAIN_PERIOD_MARK "CP" //contains period -#define CONTAIN_COMMA_MARK "CC" //contains comma -#define CONTAIN_NUMBER_MARK "CN" //contains number -#define MULTIWORD_MARK "MW" //contains underscores (multiword) - -#define SVMT_MARKS_H -#endif +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef MARKS_H + +#define SLASTW "Swn" //Last Word +#define WMARK "w" //Words +#define PMARK "p" //POS +#define KMARK "k" //Ambiguity Classes +#define MMARK "m" //Maybe +#define MFTMARK "f" //Most Frequent Tag --> f(-1) --> f-1:NN +#define PREFIX_MARK "a" //prefixes +#define SUFFIX_MARK "z" //Suffixes +#define CHAR_A_MARK "ca" //Character, counting from the beggining of the begining of the token (starting at 1) +#define CHAR_Z_MARK "cz" //Character, counting from the end of the begining of the token (starting at 1) +#define LENGTH_MARK "L" //token length +#define START_CAPITAL_MARK "SA" //start with upper case +#define START_LOWER_MARK "sa" //start with lower case +#define START_NUMBER_MARK "SN" //start with number +#define ALL_UPPER_MARK "AA" //all upper case +#define ALL_LOWER_MARK "aa" //all lower case +#define CONTAIN_CAP_MARK "CA" //contains a capital letter +#define 
CONTAIN_CAPS_MARK "CAA" //contains several capital letters +#define CONTAIN_PERIOD_MARK "CP" //contains period +#define CONTAIN_COMMA_MARK "CC" //contains comma +#define CONTAIN_NUMBER_MARK "CN" //contains number +#define MULTIWORD_MARK "MW" //contains underscores (multiword) + +#define MARKS_H +#endif diff --git a/include/stack.h b/include/nodo.h similarity index 51% copy from include/stack.h copy to include/nodo.h index 6830482..407fc72 100644 --- a/include/stack.h +++ b/include/nodo.h @@ -1,11 +1,12 @@ /* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * Author: Quentin Pradet + * Copyright (C) 2011 CEA LIST * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -16,26 +17,38 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_STACK_H - -#define STACKSIZE 100 +#ifndef NODO_H +#define NODO_H -typedef enum {FALSE = 0, TRUE = 1} -boolean; +#include "weight.h" +#include -typedef void *element_type; - -struct stack_t -{ - int top; - element_type items[STACKSIZE]; +struct nodo{ + nodo() : + ord(0), + weight(0), + weightOld(0), + next(NULL), + previous(NULL) {} + ~nodo() { + while(!stackScores.empty()) { + delete stackScores.top(); + stackScores.pop(); + } + + } + int ord; // word id + std::string wrd; // word (or constant like @CARD if cardinal) + std::string realWrd; // real word + std::string comment; + // to be filled by taggerSumWeight + std::string pos, posOld; + std::string strScores; + long double weight, weightOld; + std::stack stackScores; + // neighbors in the sentence + nodo *next; + nodo *previous; }; -boolean empty(struct stack_t *ps); -void init_stack(struct stack_t *ps); -element_type pop(struct stack_t *ps); -void push(struct stack_t *ps, element_type x); -element_type stack_top(struct stack_t *ps); - -#define SVMT_STACK_h #endif diff --git a/include/reader.h b/include/reader.h new file mode 100644 index 0000000..22f107e --- /dev/null +++ b/include/reader.h @@ -0,0 +1,66 @@ +/* + * Author: Quentin Pradet + * Copyright (C) 2011 CEA LIST + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef READER_H +#define READER_H + +#include +#include +#include + +class nodo; + +class reader +{ + public: + // we use std::cin to say "don't use this input stream". It is very + // wrong but was the simplest way to provid a valid reference... + reader() : m_input(std::cin) { } + reader(std::istream& input) : m_input(input), is_good(true) { } + + int nextNode(nodo**); + + /* Constructs a nodo given the textual information about it + * Does not fill the weights or the neighbor information */ + nodo* buildNode(std::string &word, std::string &comment); + + /* examples of accepted input: + * - ## this a comment + * - the + * - the # another comment + * - object (NN) + * - object (NN) # hello. + * - attack (VB,NN) + * + * return codes: + * - 1 comment + * - 0 normal sentence + * - -1 end of sentence + * - -2 end of file + */ + int parseWord(std::string& token, std::set tags, std::string &comment); + bool good() { return is_good; } + + private: + std::istream& m_input; + bool is_good; + std::string line_end(std::istringstream &iss); +}; + +#endif diff --git a/include/stack.h b/include/stack.h old mode 100644 new mode 100755 index 6830482..9a1216c --- a/include/stack.h +++ b/include/stack.h @@ -1,41 +1,52 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef SVMT_STACK_H - -#define STACKSIZE 100 - -typedef enum {FALSE = 0, TRUE = 1} -boolean; - -typedef void *element_type; - -struct stack_t -{ - int top; - element_type items[STACKSIZE]; -}; - -boolean empty(struct stack_t *ps); -void init_stack(struct stack_t *ps); -element_type pop(struct stack_t *ps); -void push(struct stack_t *ps, element_type x); -element_type stack_top(struct stack_t *ps); - -#define SVMT_STACK_h -#endif +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef STACK_H + +#define STACKSIZE 100 + +typedef enum {FALSE = 0, TRUE = 1} boolean; + +typedef void *element_type; + +/* +class stack_t +{ + public: + int top; + element_type items[STACKSIZE]; + + stack_t(); + ~stack_t(); +}; +*/ + +struct stack_t +{ + int top; + element_type items[STACKSIZE]; +}; + +boolean empty(struct stack_t *ps); +void init_stack(struct stack_t *ps); +element_type pop(struct stack_t *ps); +void push(struct stack_t *ps, element_type x); +element_type stack_top(struct stack_t *ps); + +#define STACK_H /* must match the #ifndef above or the guard never takes effect */ +#endif diff --git a/include/strategies.h b/include/strategies.h old mode 100644 new mode 100755 index d672c76..d42bea0 --- a/include/strategies.h +++ b/include/strategies.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -19,25 +19,25 @@ #ifndef STRAT_H //Strategy 0 .- one-pass (default) -#define STRA_1P_DEFAULT 0 +#define STRA_1P_DEFAULT 0 //Strategy 1 .- two-passes [revisiting results and relabeling] -#define STRA_2P_RELABELING 1 +#define STRA_2P_RELABELING 1 //Strategy 2 .- one-pass [robust against unknown words] -#define STRA_1P_ROBUST_UNK 2 +#define STRA_1P_ROBUST_UNK 2 //Strategy 3 .- one-pass [unsupervised learning models] -#define STRA_1P_UNSUPERVISED 3 +#define STRA_1P_UNSUPERVISED 3 //Strategy 4 .- one-pass [very robust against unknown words] -#define STRA_1P_VERY_ROBUST_UNK 4 +#define STRA_1P_VERY_ROBUST_UNK 4 //Strategy 5 .- one-pass [sentence-level likelihood] -#define STRA_1P_SENTENCE_LEVEL 5 +#define STRA_1P_SENTENCE_LEVEL 5 //Strategy 6 .- one-pass [robust sentence-level likelihood] -#define STRA_1P_ROBUST_SENTENCE_LEVEL 6 +#define STRA_1P_ROBUST_SENTENCE_LEVEL 6 #define STRAT_H #endif diff --git a/include/swindow.h b/include/swindow.h old mode 100644 new mode 100755 dissimilarity index 73% index 4e5baac..3bd919e --- a/include/swindow.h +++ b/include/swindow.h @@ -1,124 +1,130 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef SWINDOW_H - -#include "common.h" - -#define CHAR_NULL '~' -#define EMPTY_WORD "_" -#define EMPTY_POS "??" -#define EMPTY "" -#define LEFT_TO_RIGHT 1 -#define RIGHT_TO_LEFT 2 -#define PUT_MAX 0 -#define RESET_VALUES 1 -#define PUT_OLD 2 - -#define COUNTING_FROM_END 111 -#define COUNTING_FROM_BEGIN 222 - -struct nodo -{ - int ord; - char wrd[TAM_WORD]; - char realWrd[TAM_WORD]; - char comment[TAM_LINE]; - char pos[TAM_POS],posOld[TAM_POS]; - long double weight,weightOld; - struct stack_t *stackScores; - nodo *next; - nodo *previous; -}; - -class swindow -{ - private: - FILE *input; - //List Control - nodo *first; - nodo *last; - int numObj; - - //Window Control - nodo *index,*beginWin,*endWin; - int lengthWin,posIndex,posBegin,posEnd; - - void init(); - int iniList(); - - int readSentence(); - int readInput(); - int readInput_old(); - int winAdd(char *wrd, char *pos); - - int winLookRegExp2(void *er,char *str); - void winCompRegExp(); - void winFreeRegExp(); - - public: - int winLookRegExp(char *m); - int winMaterializePOSValues(int action); - - ~swindow(); - swindow(FILE *input); - swindow(FILE *input,int number, int position); - swindow(FILE *input,int number); - int next(); - int previous(); - nodo *getIndex(); - nodo *get(int position,int direction); - int show(); - - void putLengthWin(int l); - void putIndex(int i); - - int sentenceLength(); - - int winExistUnkWord(int direction, dictionary *d); - - void winPushWordFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction); - void winPushPosFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction); - void winPushAmbiguityFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction); - void winPushMFTFeature(void *ptr,dictionary *d, struct stack_t 
*pila,int direction); - void winPushMaybeFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction); - void winPushSwnFeature(struct stack_t *pila); - void winPushUnknownFeatures(char *str, struct stack_t *pila); - - void winPushSuffixFeature(char *wrd, struct stack_t *pila,int longitud); - void winPushPrefixFeature(char *wrd, struct stack_t *pila,int longitud); - //void winPushStartCapFeature(char *wrd, struct stack_t *pila); - void winPushAllUpFeature(char *wrd,stack_t *pila); - void winPushAllLowFeature(char *wrd,stack_t *pila); - void winPushContainCapFeature(char *wrd, stack_t *pila); - void winPushContainCapsFeature(char *wrd, stack_t *pila); - void winPushContainPeriodFeature(char *wrd, stack_t *pila); - void winPushContainCommaFeature(char *wrd, stack_t *pila); - void winPushContainNumFeature(char *wrd, stack_t *pila); - void winPushMultiwordFeature(char *wrd, stack_t *pila); - void winPushLetterFeature(char *, stack_t *, int, int ); - void winPushLenghtFeature(char *wrd, stack_t *pila); - void winPushStartWithCapFeature(char *,stack_t *); - void winPushStartWithLowerFeature(char *,stack_t *); - void winPushStartWithNumberFeature(char *,stack_t *); - int iniGeneric(); - void deleteList(); -}; - -#define SWINDOW_W -#endif +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef SWINDOW_H +#define SWINDOW_H + +#include "common.h" +#include "dict.h" +#include "reader.h" +#include +#include +#include + +#define CHAR_NULL '~' +#define EMPTY_WORD "_" +#define EMPTY_POS "??" +#define EMPTY "" +#define LEFT_TO_RIGHT 1 +#define RIGHT_TO_LEFT 2 +#define PUT_MAX 0 +#define RESET_VALUES 1 +#define PUT_OLD 2 + +#define COUNTING_FROM_END 111 +#define COUNTING_FROM_BEGIN 222 + + +struct weight_node_t; +struct nodo; + +class swindow +{ + public: + std::ostream* m_output; + reader m_reader; + + //List Control + nodo *first; + nodo *last; + int numObj; + + //Window Control + nodo *index,*beginWin,*endWin; + int lengthWin,posIndex,posBegin,posEnd; + + //User window: this allows users to specify their own windows + std::vector user_window; + + void init(dictionary* dic /*= 0*/); + int iniList(dictionary* dic /*= 0*/); + + int readSentence(dictionary* dic /*= 0*/); + int readInput(dictionary* dic); + void winAdd(nodo * node); + + int winLookRegExp2(void *er,const std::string& str); + void winCompRegExp(); + void winFreeRegExp(); + + public: + int winLookRegExp(const std::string& m); + int winMaterializePOSValues(int action); + + ~swindow(); + swindow(std::istream& input, std::ostream* output, dictionary* dic /*= 0*/); + swindow(std::istream& input,int number, int position, std::ostream* output, dictionary* dic /*= 0*/); + swindow(std::istream& input,int number, std::ostream* output, dictionary* dic /*= 0*/); + swindow(int lengthWin, dictionary *dic); + bool next(); + bool previous(); + nodo *getIndex(); + nodo *get(int position,int direction); + nodo *get_user(int position); + + nodo *get_intern(int position); + int show(int showScoresFlag, int showComments); + + void putLengthWin(int l); + void putIndex(int i); + + int 
sentenceLength(); + + void setWindow(const std::vector&); + + int winExistUnkWord(int direction, dictionary *d); + + void winPushWordFeature(void* ptr, dictionary* d, stack< string >& pila, int direction); + void winPushPosFeature(void *ptr,dictionary *d, std::stack& pila,int direction); + void winPushAmbiguityFeature(void *ptr,dictionary *d, std::stack& pila,int direction); + void winPushMFTFeature(void *ptr,dictionary *d, std::stack& pila,int direction); + void winPushMaybeFeature(void *ptr,dictionary *d, std::stack& pila,int direction); + void winPushSwnFeature(std::stack& pila); + + void winPushSuffixFeature(const std::string& wrd, std::stack& pila,int longitud); + void winPushPrefixFeature(const std::string& wrd, std::stack& pila,int longitud); + //void winPushStartCapFeature(const std::string& wrd, struct std::stack *pila); + void winPushAllUpFeature(const std::string& wrd, std::stack& pila); + void winPushAllLowFeature(const std::string& wrd, std::stack& pila); + void winPushContainCapFeature(const std::string& wrd, std::stack& pila); + void winPushContainCapsFeature(const std::string& wrd, std::stack& pila); + void winPushContainPeriodFeature(const std::string& wrd, std::stack& pila); + void winPushContainCommaFeature(const std::string& wrd, std::stack& pila); + void winPushContainNumFeature(const std::string& wrd, std::stack& pila); + void winPushMultiwordFeature(const std::string& wrd, std::stack& pila); + void winPushLetterFeature(const std::string& wrd, std::stack& pila, int where, int position ); + void winPushLenghtFeature(const std::string& wrd, std::stack& pila); + void winPushStartWithCapFeature(const std::string& wrd, std::stack& pila ); + void winPushStartWithLowerFeature(const std::string& wrd, std::stack& pila ); + void winPushStartWithNumberFeature(const std::string& wrd, std::stack& pila ); + int iniGeneric(dictionary* dic /*= 0*/); + void deleteList(); +}; + + +#endif diff --git a/include/tagger.h b/include/tagger.h old mode 100644 new mode 
100755 dissimilarity index 67% index c714aa5..86694a8 --- a/include/tagger.h +++ b/include/tagger.h @@ -1,82 +1,97 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef TAGGER_H - -struct models_t -{ - weightRepository *wr,*wr2,*wrUnk,*wrUnk2; - //hash_t *bias,*bias2,*biasUnk,*biasUnk2; - simpleList featureList; - simpleList featureListUnk; -}; - -class tagger -{ - private: - //Flow Control - char flow[10]; - int taggerStrategy,taggerNumLaps,taggerWinIndex,taggerWinLength; - float taggerKFilter,taggerUFilter; - char taggerBackupDict[150],taggerModelName[150]; - - struct stack_t *stk; - models_t *taggerModelList; - models_t *taggerModelRunning; - dictionary *d; - swindow *sw; - weight_node_t *weightUnk; - - int taggerRightSense(); - int taggerLeftSense(); - - void taggerSumWeight(weightRepository *wRep,hash_t *bias,weight_node_t *weight,int numMaybe, int *max); - void taggerGenerateScore(nodo *elem,int direction); - - weight_node_t *taggerCreateWeightNodeArray(int numMaybe,int index); - weight_node_t *taggerInitializeWeightNodeArray(int numMaybe,weight_node_t *w); - weight_node_t *taggerCreateWeightUnkArray(char *name); - hash_t *taggerCreateBiasHash(char *name); - void taggerLoadModels(models_t 
*model, int taggerNumModel); - - void taggerStadistics(int numWords, int numSentences, double realTime,double usrTime, double sysTime); - void taggerShowVerbose(int num,int isEnd); - - int taggerRightSenseSpecialForUnknown(); - int taggerLeftSenseSpecialForUnknown(); - void taggerDoNormal(int *numWords, int *numSentences); - void taggerDoSpecialForUnknown(int *numWords, int *numSentences); - void taggerDoNTimes(int *numWords, int *numSentences,int laps); - - public: - void taggerRun(); - void taggerLoadModelsForTagging(); - void taggerPutFlow(char *inFlow); - void taggerPutBackupDictionary(char *dictName); - void taggerPutStrategy(int num); - void taggerPutWinLength(int l); - void taggerPutWinIndex(int i); - void taggerPutKWeightFilter(float kfilter); - void taggerPutUWeightFilter(float ufilter); - void taggerInit(); - tagger(char *model); - ~tagger(); -}; - -#define TAGGER_H -#endif +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef TAGGER_H +#define TAGGER_H + +#include "common.h" +#include "weight.h" +#include "list.h" +#include "hash.h" +#include "swindow.h" +#include "dict.h" +#include "er.h" + +struct models_t{ + weightRepository *wr,*wr2,*wrUnk,*wrUnk2; + //hash_t *bias,*bias2,*biasUnk,*biasUnk2; + simpleList featureList; + simpleList featureListUnk; +}; + + +class tagger +{ + public: + bool taggerShowScoresFlag,taggerShowCommentsFlag; + + //Flow Control + std::string flow; + int taggerStrategy,taggerNumLaps,taggerWinIndex,taggerWinLength; + float taggerKFilter,taggerUFilter; + std::string taggerBackupDict,taggerModelName; + + std::stack stk; + models_t *taggerModelList; + models_t *taggerModelRunning; + dictionary *d; + swindow *sw; + + int taggerRightSense(); + int taggerLeftSense(); + + std::string taggerSumWeight(weightRepository* wRep, weight_node_t* weight, int numMaybe, int* max); + void taggerGenerateScore(nodo *elem,int direction); + + weight_node_t *taggerCreateWeightNodeArray(int numMaybe, dataDict* index); + weight_node_t *taggerCreateWeightUnkArray(int *numMaybe); + hash_t *taggerCreateBiasHash(const std::string& name); + void taggerLoadModels(models_t *model, int taggerNumModel); + + void taggerStadistics(int numWords, int numSentences, double realTime,double usrTime, double sysTime); + void taggerShowVerbose(int num,int isEnd); + + + int taggerRightSenseSpecialForUnknown(); + int taggerLeftSenseSpecialForUnknown(); + void taggerDoNormal(int *numWords, int *numSentences); + void taggerDoSpecialForUnknown(int *numWords, int *numSentences); + void taggerDoNTimes(int *numWords, int *numSentences,int laps); + +public: + void taggerRun(); + void taggerLoadModelsForTagging(); + void taggerShowNoComments(); + void taggerShowComments(); + void 
taggerActiveShowScoresFlag(); + void taggerDesactiveShowScoresFlag(); + void taggerPutFlow(const std::string& inFlow); + void taggerPutBackupDictionary(const std::string& dictName); + void taggerPutStrategy(int num); + void taggerPutWinLength(int l); + void taggerPutWinIndex(int i); + void taggerPutKWeightFilter(float kfilter); + void taggerPutUWeightFilter(float ufilter); + void taggerInit(std::istream& input, std::ostream& output); + void taggerInit(); + tagger(const std::string& model); + ~tagger(); +}; + + +#endif diff --git a/include/weight.h b/include/weight.h old mode 100644 new mode 100755 index ebe7d51..2d9fb85 --- a/include/weight.h +++ b/include/weight.h @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -16,36 +16,40 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef SVMT_WEIGHT_H +#ifndef WEIGHT_H +#define WEIGHT_H -typedef struct weight_node_t -{ - char pos[5]; - long double data; -} weight_node_t; +#include "hash.h" -class weightRepository +struct weight_node_t { -private: - hash_t wr; + weight_node_t() : pos(), data(0) {} + std::string pos; + long double data; +}; - //char *wrGetMergeInput(hash_t *tptr); //DEL 180705 - char *wrGetMergeInput(hash_t *tptr, float filter); //ADD 180705 - FILE *openFile(char *name, char mode[]); - void wrReadMergeModel(FILE *in,float filter); - char wrSaltarBlancs(FILE *in,char c,int jmp); - void wrAddPOS(uintptr_t obj, char* pos, long double weight); +class weight_struct_t; +class weightRepository +{ + private: + hash_t wr; + + std::string wrGetMergeInput(hash_t *tptr, float filter); //ADD 180705 + //char *wrGetMergeInput(hash_t *tptr); //DEL 180705 + FILE *openFile(const std::string& name, char mode[]); + void wrReadMergeModel(FILE *in,float filter); + char wrSaltarBlancs(FILE *in,char c,int jmp); + void wrAddPOS(unsigned long obj, const std::string& pos, long double weight); public: - long double wrGetWeight(const char *feature,char *pos); - void wrAdd(char *feature, char* pos, long double weight); - //void wrWrite(const char *outName); //DEL 180705 - void wrWrite(const char *outName, float filter); //ADD 180705 - void wrWriteHash(hash_t *tptr,FILE *f,char separador); - weightRepository(char *fileName,float filter); - weightRepository(); - ~weightRepository(); + long double wrGetWeight(const std::string& feature,const std::string& pos); + void wrAdd(const std::string& feature, const std::string& pos, long double weight); + //void wrWrite(char *outName); //DEL 180705 + void wrWrite(const std::string& outName, float filter); //ADD 180705 + void wrWriteHash(hash_t *tptr,FILE *f,char separador); + weightRepository(const std::string& fileName,float filter); + weightRepository(); + 
~weightRepository(); }; -#define SVMT_WEIGHT_H #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..93e1cad --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,69 @@ +########### next target ############### + +SET(svmtool_LIB_SRCS + swindow.cc + weight.cc + dict.cc + tagger.cc + common.cc + er.cc + mapping.cc + api.cc + reader.cc +) + +add_library(svmtool SHARED ${svmtool_LIB_SRCS}) + +target_link_libraries(svmtool) + +set_target_properties(svmtool PROPERTIES VERSION 1.1.6 SOVERSION 1) +install(TARGETS svmtool DESTINATION lib) + +########### next target ############### + +SET(SVMTagger_SRCS + SVMTagger.cc +) + +add_executable(SVMTagger ${SVMTagger_SRCS}) + +target_link_libraries(SVMTagger svmtool) + +install(TARGETS SVMTagger DESTINATION bin) + +########### next target ############### + +SET(SVMTeval_SRCS + SVMTeval.cc +) + +add_executable(SVMTeval ${SVMTeval_SRCS}) + +target_link_libraries(SVMTeval svmtool) + +install(TARGETS SVMTeval DESTINATION bin) + +########### next target ############### + +SET(SVMTlearn_SRCS +SVMTlearn.cc +learner.cc +) + +add_executable(SVMTlearn ${SVMTlearn_SRCS}) + +target_link_libraries(SVMTlearn svmtool) + +install(TARGETS SVMTlearn DESTINATION bin) + +########### install files ############### +install(FILES + tagger.h + common.h + weight.h + list.h + hash.h + swindow.h + dict.h + er.h +DESTINATION include/svmtool) diff --git a/src/api.cc b/src/api.cc new file mode 100755 index 0000000..d080977 --- /dev/null +++ b/src/api.cc @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "api.h" +#include "hash.h" +#include "list.h" +#include "dict.h" +#include "weight.h" +#include "swindow.h" +#include "tagger.h" +#include "er.h" + +/*****************************************************************/ + +nodoResult::nodoResult() +{ + this->pos = NULL; + this->wrd = NULL; + this->scores = NULL; +} + +nodoResult::~nodoResult() +{ + delete[] this->wrd; + delete[] this->pos; + delete[] this->scores; + index = 0; +} + +void nodoResult::pushIndex(int iNum) +{ + this->index = iNum; +} + +void nodoResult::pushScores(char *text) +{ + this->scores = new char[strlen(text)+1]; + strcpy(this->scores,text); +} + +void nodoResult::pushPOS(char *text) +{ + this->pos = new char[strlen(text)+1]; + strcpy(this->pos,text); +} + +void nodoResult::pushWord(char *text) +{ + this->wrd = new char[strlen(text)+1]; + strcpy(this->wrd,text); +} + +int nodoResult::getIndex() +{ + return this->index; +} + +char *nodoResult::getScores() +{ + return this->scores; +} + +char *nodoResult::getWord() +{ + return this->wrd; +} + +char *nodoResult::getPOS() +{ + return this->pos; +} + +/*****************************************************************/ + +apiResult::apiResult(int iNum) +{ + numItems = iNum; + array = new nodoResult[iNum]; +} + +apiResult::~apiResult() +{ + delete[] array; +} + +int apiResult::pushScores(char *text, int iPos) +{ + if (iPos >= this->numItems) return -1; + array[iPos].pushScores(text); + return 0; +} + +int 
apiResult::pushPOS(char *text, int iPos) +{ + if (iPos >= this->numItems) return -1; + array[iPos].pushPOS(text); + return 0; +} + +int apiResult::pushWord(char *text, int iPos) +{ + if (iPos >= this->numItems) return -1; + array[iPos].pushWord(text); + return 0; +} + +char *apiResult::getScores(int iPos) +{ + return array[iPos].getScores(); +} + +char *apiResult::getPOS(int iPos) +{ + return array[iPos].getPOS(); +} + +char *apiResult::getWord(int iPos) +{ + return array[iPos].getWord(); +} + +void apiResult::print() +{ + for (int i=0; i < this->numItems; i++) + { + fprintf(stdout, "%s %s %s\n",this->getWord(i),array[i].getPOS(),array[i].getScores()); + } +} +/*****************************************************************/ + +tagger *t; +int verbose = 0; + +int apiInsertSentence(const char *szSentence) +{ + char wrd[200]; + FILE *f = fopen ("stdin.tmp","w"); + unsigned int i = 0; + + if ( t == NULL ) return -1; + + for (i=0; szSentence[i] !='\0' && i < strlen(szSentence); i++) + { + strcpy(wrd,""); + int ret = sscanf(szSentence+i,"%s ",wrd); + + if (ret > 0) + { + fprintf(f,"%s\n",wrd); + i = i + strlen(wrd); + } + else break; + + } + + fclose(f); + + return i; +} + +apiResult *apiTaggerRun(const char *szSentence, int iNumWords) +{ + if ( t == NULL ) return NULL; + + int ret = apiInsertSentence (szSentence); + + if ( ret == -1 ) return NULL; + + char tmp1[500],tmp2[500], tmp3[2000]; + char aux[3000]; + + fflush(stdin); + std::ifstream in("stdin.tmp"); + std::ofstream fout("stdout.tmp"); + + t->taggerShowNoComments(); + t->taggerActiveShowScoresFlag(); + + t->taggerInit(in,fout); + t->taggerRun(); + fout.close(); + + FILE *f = fopen("stdout.tmp","r"); + + int index = 0; + apiResult *out = new apiResult(iNumWords); + + while (!feof(f) && index < iNumWords) + { + strcpy(aux,""); + strcpy(tmp1,""); strcpy(tmp2,""); strcpy(tmp3,""); + char *ret = fgets(aux,3000,f); + if (ret != NULL ) + { + sscanf(aux,"%s %s",tmp1, tmp2); + if ((strlen(tmp1)+strlen(tmp2)+2) < 
strlen(aux)) + { + strcpy(tmp3,aux+strlen(tmp1)+1+strlen(tmp2)+1); + tmp3[strlen(tmp3)-1] = '\0'; + } + out->pushWord(tmp1, index); + out->pushPOS (tmp2, index); + out->pushScores (tmp3,index); + index++; + } + + } + + remove ( "stdin.tmp" ); + remove ( "stdout.tmp" ); + return out; +} + + +int apiTaggerCreate( char *szModelName ) +{ + if ( strcmp(szModelName,"") == 0 ) return -1; + + t = new tagger(szModelName); + + return 0; +} + +int apiTaggerInitialize ( int iStrategy, + const char *szSense, + int iWinLength, + int iWinIndex, + float fWFKnown, + float fWFUnk) +{ + if ( t == NULL ) return -1; + + verbose = 0; + + erCompRegExp(); + + if ( fWFKnown != -1 ) t->taggerPutKWeightFilter(fWFKnown); + if ( fWFUnk != -1 ) t->taggerPutUWeightFilter(fWFUnk); + if ( iStrategy != -1 ) t->taggerPutStrategy(iStrategy); + if ( strcmp (szSense,"") == 0 ) t->taggerPutFlow(szSense); + if ( iWinLength != -1 ) t->taggerPutWinLength(iWinLength); + if ( iWinIndex != -1 ) t->taggerPutWinIndex(iWinIndex); + + t->taggerLoadModelsForTagging(); + return 0; +} + +void apiTaggerDestroy() +{ + erFreeRegExp(); + if ( t != NULL ) delete t; + t = NULL; +} + diff --git a/src/bin/SVMTagger.cc b/src/bin/SVMTagger.cc old mode 100644 new mode 100755 index 0386120..05683c6 --- a/src/bin/SVMTagger.cc +++ b/src/bin/SVMTagger.cc @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -27,22 +27,22 @@ #include "tagger.h" #include "er.h" -#define ERROR1 "\nERROR: Value incorrect in -T option.\n" -#define ERROR2 "\nERROR: Value incorrect in -S option.\n" -#define ERROR3 "\nERROR: Value incorrect in -I option.\n" -#define ERROR4 "\nERROR: Value incorrect in -L option.\n" -#define ERROR5 "\nERROR: Value incorrect in -I or -L option. Window length have to be greater than Interest Point.\n" -#define ERROR6 "\nERROR: You can not change Interes Point (-I) if you don't change the length of the window (-L).\n" -#define ERROR8 "\nERROR: Incorrect Options\n" +#define ERROR1 "\nERROR: Value incorrect in -T option.\n" +#define ERROR2 "\nERROR: Value incorrect in -S option.\n" +#define ERROR3 "\nERROR: Value incorrect in -I option.\n" +#define ERROR4 "\nERROR: Value incorrect in -L option.\n" +#define ERROR5 "\nERROR: Value incorrect in -I or -L option. Window length have to be greater than Interest Point.\n" +#define ERROR6 "\nERROR: You can not change Interes Point (-I) if you don't change the length of the window (-L).\n" +#define ERROR8 "\nERROR: Incorrect Options\n" #define ERRORK "\nERROR: Value incorrect in -K option.\n" #define ERRORU "\nERROR: Value incorrect in -U option.\n" -extern int verbose_svmtool; +int verbose = 0; -void printHelp() +void printHelp(char *progname) { - fprintf(stderr,"\nSVMTool++ v 1.1.2 -- SVMTagger\n\n"); - fprintf(stderr,"Usage : svmt [options] < stdin > stdout\n\n"); + fprintf(stderr,"\nSVMTool++ v 1.1.6 -- SVMTagger\n\n"); + fprintf(stderr,"Usage : %s [options] < stdin > stdout\n\n", progname); fprintf(stderr,"options:\n"); fprintf(stderr,"\t-L or -l \n"); fprintf(stderr,"\t\t have to be greater than 2\n"); @@ -59,18 +59,18 @@ void printHelp() fprintf(stderr,"\t\t1\ttwo-passes [revisiting results and relabeling - requires model 2 and model 1]\n"); fprintf(stderr,"\t\t2\tone-pass [robust against unknown words - requires model 0 and model 2]\n"); fprintf(stderr,"\t\t3\tone-pass [unsupervised learning models - 
requires model 3]\n"); - fprintf(stderr,"\t\t4\tone-pass [very robust against unknown words - requires model 4]\n"); + fprintf(stderr,"\t\t4\tone-pass [very robust against unknown words - requires model 4]\n"); fprintf(stderr,"\t\t5\tone-pass [sentence-level likelihood - requires model 0] Not implemented!!\n"); fprintf(stderr,"\t\t6\tone-pass [robust sentence-level likelihood - requires model 4] Not implemented!!\n"); fprintf(stderr,"\t-B or -b \n"); fprintf(stderr,"\t-K weight filtering threshold for known words (default is 0)\n"); fprintf(stderr,"\t-U weight filtering threshold for unknown words (default is 0)\n"); fprintf(stderr,"\t-V or -v verbose\n"); + fprintf(stderr,"\t-A or -a show scores\n"); fprintf(stderr,"\nmodel: model location (path/name)\n"); - fprintf(stderr,"\nUsage : SVMTagger -V -S LRL -T 0 /home/users/me/SVMTool/models/eng/WSJTP < WSJTP.TEST > WSJTP.TEST.OUT\n\n"); + fprintf(stderr,"\nUsage : %s -V -S LRL -T 0 /home/usuaris/smoya/SVMT/eng/WSJTP < WSJTP.TEST > WSJTP.TEST.OUT\n\n", progname); } - /* -1 -t error -2 -s error @@ -84,29 +84,26 @@ int options(int argc,char *argv[]) int isIP=0,ip=4,isLength=0,length=7; for (int i=1;i \n\n"); + fprintf(stderr,"\nSVMTool++ v 1.1.6 -- SVMTeval\n\n"); + fprintf(stderr,"Usage : %s [mode] \n\n", progname); fprintf(stderr,"\t- mode:\t0 - complete report (everything)\n"); fprintf(stderr,"\t\t1 - overall accuracy only [default]\n"); fprintf(stderr,"\t\t2 - accuracy of known vs unknown words\n"); @@ -56,106 +55,95 @@ void printHelp() fprintf(stderr,"\t- model: model location (path + name)\n"); fprintf(stderr,"\t- gold: correct tagging file\n"); fprintf(stderr,"\t- pred: predicted tagging file\n\n"); - fprintf(stderr,"Example : SVMTeval WSJTP WSJTP.IN WSJTP.OUT\n\n"); + fprintf(stderr,"Example : %s WSJTP WSJTP.IN WSJTP.OUT\n\n", progname); } - /************************************************************************/ class eval { - private: - char sModel[500]; - char sGold[500]; - char sPred[500]; - FILE *gold; - 
FILE *pred; - dictionary *d; - hash_t stat_Amb_Level; - hash_t stat_Class_Amb; - hash_t stat_POS; - int report_type; - int numAmbLevel; - int numAmbClass; - - void printHashStats(hash_t *tptr, int put_eol, const char *column_name); - void printKnownVsUnknown(int knownAmb, int knownUnamb, int unknown,int unkHits, int knownHitsAmb,int knownHitsUnamb); - void printTaggingSumary(int known,int unknown,int ambiguous,int well,int wellMFT); - void printOverallAccuracy(int total, int well, int wellMFT, float pAmb); - void printStatsByLevel(hash_t *h); - void printStatsByAmbiguityClass(hash_t *h); - void printStatsByPOS(hash_t *h); - void addStatsToHash(hash_t *h,char *key,int is_hit, int is_mft); - void makeReport(dictionary *d,FILE *gold, FILE *pred); - - public: - eval(char *model, char *goldName, char *predName); - void evalPutReportType(int report); - void evalRun(); +private: + char sModel[500]; + char sGold[500]; + char sPred[500]; + FILE *gold; + FILE *pred; + dictionary *d; + hash_t stat_Amb_Level; + hash_t stat_Class_Amb; + hash_t stat_POS; + int report_type; + int numAmbLevel; + int numAmbClass; + + void printHashStats(hash_t *tptr, int put_eol, const char *column_name); + void printKnownVsUnknown(int knownAmb, int knownUnamb, int unknown,int unkHits, int knownHitsAmb,int knownHitsUnamb); + void printTaggingSumary(int known,int unknown,int ambiguous,int well,int wellMFT); + void printOverallAccuracy(int total, int well, int wellMFT, float pAmb); + void printStatsByLevel(hash_t *h); + void printStatsByAmbiguityClass(hash_t *h); + void printStatsByPOS(hash_t *h); + void addStatsToHash(hash_t< stat_t* >* h, const std::string& key, int is_hit, int is_mft); + void makeReport(dictionary *d,FILE *gold, FILE *pred); + +public: + eval(char *model, char *goldName, char *predName); + void evalPutReportType(int report); + void evalRun(); }; /******************************************************************/ -void eval::printHashStats(hash_t *tptr, int put_eol, const char 
*column_name) +void eval::printHashStats(hash_t *tptr, int put_eol, const char *column_name) { - fprintf(stderr,"%s\tHITS\t\tTRIALS\t\tACCURACY\t\tMFT-ACCURACY\n",column_name); - fprintf(stderr,"* ------------------------------------------------------------------------- \n"); - hash_node_t *node, *last; - int i; - - char c='\0'; - if (put_eol==TRUE) c='\n'; - - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - stat_t *s = (stat_t *)last->data; - fprintf(stderr,"%s%c\t%d\t/\t%d\t=\t%.4f %\t\t%.4f %\n",s->key,c,s->hits,s->trials, 100*((float)s->hits/s->trials), 100*((float)s->mft/s->trials) ); - } - } - + fprintf(stderr,"%s\tHITS\t\tTRIALS\t\tACCURACY\t\tMFT-ACCURACY\n",column_name); + fprintf(stderr,"* ------------------------------------------------------------------------- \n"); +// hash_node_t *node, *last; +// int i; + + char c='\0'; + if (put_eol==TRUE) c='\n'; + + for (hash_t::iterator it = tptr->begin(); it != tptr->end(); it++) + { + stat_t *s = (stat_t *)((*it).second); + fprintf(stderr,"%s%c\t%d\t/\t%d\t=\t%.4f %%\t\t%.4f %%\n",s->key.c_str(),c,s->hits,s->trials, 100*((float)s->hits/s->trials), 100*((float)s->mft/s->trials) ); + } } - /******************************************************************/ void eval::printKnownVsUnknown(int knownAmb, int knownUnamb, int unknown,int unkHits, int knownHitsAmb,int knownHitsUnamb) { int knownHits = knownHitsAmb + knownHitsUnamb; int known = knownAmb + knownUnamb; - + fprintf(stderr,"* ================= KNOWN vs UNKNOWN WORDS ================================\n"); fprintf(stderr,"\tHITS\t\tTRIALS\t\tACCURACY\n"); fprintf(stderr,"* -------------------------------------------------------------------------\n"); fprintf(stderr,"* ======= known ===========================================================\n"); - fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %\n",knownHits,known,100*((float)knownHits/known)); + fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f 
%%\n",knownHits,known,100*((float)knownHits/known)); fprintf(stderr,"-------- known unambiguous words ------------------------------------------\n"); - fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %\n",knownHitsUnamb,knownUnamb,100*((float)knownHitsUnamb/knownUnamb)); + fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %%\n",knownHitsUnamb,knownUnamb,100*((float)knownHitsUnamb/knownUnamb)); fprintf(stderr,"-------- known ambiguous words --------------------------------------------\n"); - fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %\n",knownHitsAmb,knownAmb,100*((float)knownHitsAmb/knownAmb)); + fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %%\n",knownHitsAmb,knownAmb,100*((float)knownHitsAmb/knownAmb)); fprintf(stderr,"* ======= unknown =========================================================\n"); - fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %\n",unkHits,unknown,100*((float)unkHits/unknown)); + fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f %%\n",unkHits,unknown,100*((float)unkHits/unknown)); fprintf(stderr,"* =========================================================================\n"); } - /******************************************************************/ -void eval::printTaggingSumary(int known,int unknown,int ambiguous,int well,int wellMFT) +void eval::printTaggingSumary(int known,int unknown,int ambiguous,int /*well*/,int wellMFT) { fprintf(stderr,"* ================= TAGGING SUMMARY =======================================\n"); fprintf(stderr,"#WORDS\t\t = %d\n",known+unknown); - fprintf(stderr,"#KNOWN\t\t = %d\t/\t%d\t--> (%.4f %)\n",known,known+unknown,100*((float)known/(known+unknown))); - fprintf(stderr,"#UNKNOWN\t = %d\t/\t%d\t--> (%.4f %)\n",unknown,known+unknown,100*((float)unknown/(known+unknown))); - fprintf(stderr,"#AMBIGUOUS\t = %d\t/\t%d\t--> (%.4f %)\n",ambiguous,known+unknown,100*((float)ambiguous/(known+unknown))); - fprintf(stderr,"#MFT baseline\t = %d\t/\t%d\t--> (%.4f %)\n",wellMFT,known+unknown,100*((float)wellMFT/(known+unknown))); + fprintf(stderr,"#KNOWN\t\t = %d\t/\t%d\t--> (%.4f 
%%)\n",known,known+unknown,100*((float)known/(known+unknown))); + fprintf(stderr,"#UNKNOWN\t = %d\t/\t%d\t--> (%.4f %%)\n",unknown,known+unknown,100*((float)unknown/(known+unknown))); + fprintf(stderr,"#AMBIGUOUS\t = %d\t/\t%d\t--> (%.4f %%)\n",ambiguous,known+unknown,100*((float)ambiguous/(known+unknown))); + fprintf(stderr,"#MFT baseline\t = %d\t/\t%d\t--> (%.4f %%)\n",wellMFT,known+unknown,100*((float)wellMFT/(known+unknown))); } - /******************************************************************/ void eval::printOverallAccuracy(int total, int well, int wellMFT, float pAmb) @@ -163,16 +151,15 @@ void eval::printOverallAccuracy(int total, int well, int wellMFT, float pAmb) fprintf(stderr,"* ================= OVERALL ACCURACY ======================================\n"); fprintf(stderr,"\tHITS\t\tTRIALS\t\tACCURACY\tMFT-baseline\n"); fprintf(stderr,"* -------------------------------------------------------------------------\n"); - fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f\t\t%.4f%\n",well,total,100*((float)well/total),100*((float)wellMFT/total)); + fprintf(stderr,"\t%d\t/\t%d\t=\t%.4f\t\t%.4f%%\n",well,total,100*((float)well/total),100*((float)wellMFT/total)); fprintf(stderr,"* =========================================================================\n"); - fprintf(stderr,"\tAmbiguity Average for Known words = %.5f POS/word\n",pAmb); + fprintf(stderr,"\tAmbiguity Average for Known words = %5f POS/word\n",pAmb); fprintf(stderr,"* =========================================================================\n"); } - /******************************************************************/ -void eval::printStatsByLevel(hash_t *h) +void eval::printStatsByLevel(hash_t *h) { fprintf(stderr,"* ================= ACCURACY PER LEVEL OF AMBIGUITY =======================\n"); fprintf(stderr,"#CLASSES = %d\n",numAmbLevel); @@ -180,10 +167,9 @@ void eval::printStatsByLevel(hash_t *h) printHashStats(h,FALSE,"LEVEL"); } - /******************************************************************/ 
-void eval::printStatsByAmbiguityClass(hash_t *h) +void eval::printStatsByAmbiguityClass(hash_t *h) { fprintf(stderr,"* ================= ACCURACY PER CLASS OF AMBIGUITY =======================\n"); fprintf(stderr,"#CLASSES = %d\n",numAmbClass); @@ -191,25 +177,22 @@ void eval::printStatsByAmbiguityClass(hash_t *h) printHashStats(h,TRUE,"CLASS"); } - /******************************************************************/ -void eval::printStatsByPOS(hash_t *h) +void eval::printStatsByPOS(hash_t *h) { fprintf(stderr,"* =================== ACCURACY PER PART-OF_SPEECH =========================\n"); printHashStats(h,FALSE,"POS"); } - /******************************************************************/ -void eval::addStatsToHash(hash_t *h,char *key,int is_hit, int is_mft) +void eval::addStatsToHash(hash_t* h, const std::string& key, int is_hit, int is_mft) { - uintptr_t p = hash_lookup(h,key); - if (p!=HASH_FAIL) + stat_t * s = h->hash_lookup(key); + if ((long)s!=HASH_FAIL) { - stat_t *s = (stat_t *)p; - strcpy(s->key,key); + s->key = key; if (is_hit==TRUE) s->hits++; if (is_mft==TRUE) s->mft++; s->trials++; @@ -218,23 +201,23 @@ void eval::addStatsToHash(hash_t *h,char *key,int is_hit, int is_mft) { if (report_type==3 || report_type==0) numAmbLevel++; if (report_type==4 || report_type==0) numAmbClass++; - stat_t *s = new stat_t; - strcpy(s->key,key); + s = new stat_t; + s->key = key; s->trials=1; s->hits=0; s->mft=0; if (is_hit==TRUE) s->hits++; if (is_mft==TRUE) s->mft++; - hash_insert(h,s->key,(uintptr_t) s); + h->hash_insert(s->key,s); } } - /******************************************************************/ void eval::makeReport(dictionary *d,FILE *gold, FILE *pred) { - char *mft,wrd1[150],wrd2[150],pos1[5],pos2[5]; + std::string mft; + char wrd1[150],wrd2[150],pos1[5],pos2[5]; int totalWords=0, well = 0,known=0,unknown=0,wellMFT=0; int ambiguous=0,unambiguous=0; int unkHits=0,knownHitsAmb=0,knownHitsUnamb=0; @@ -246,76 +229,69 @@ void eval::makeReport(dictionary 
*d,FILE *gold, FILE *pred) { is_mft=FALSE; is_hit=FALSE; - + char gold_line[250] = "\n"; char pred_line[250] = "\n"; - + while ( !feof(gold) && ( strcmp(gold_line,"\n") == 0 || ( gold_line[0]=='#' && gold_line[1]=='#') ) ) - fgets(gold_line,250,gold); + fgets(gold_line,250,gold); while ( !feof(pred) && ( strcmp(pred_line,"\n") == 0 || ( pred_line[0]=='#' && pred_line[1]=='#') ) ) - fgets(pred_line,250,pred); + fgets(pred_line,250,pred); ret1 = sscanf (gold_line,"%s %s",wrd1,pos1); ret2 = sscanf (pred_line,"%s %s",wrd2,pos2); - + if ( ret1 >= 0 && ret2 >= 0 ) { - int w = d->getElement(wrd1); + dataDict* w = d->getElement(wrd1); int numMaybe; - if (w!=HASH_FAIL) //Si es conocida + if ((long)w!=HASH_FAIL) //Si es conocida { known++; numMaybe = d->getElementNumMaybe(w); - - if (numMaybe>1) - ambiguous++; //Si es ambigua - //Si no es ambigua - else unambiguous++; + + if (numMaybe>1) + ambiguous++; //Si es ambigua + else unambiguous++; //Si no es ambigua contAmbiguities += numMaybe; - mft = d->getMFT(w); - if (strcmp(mft,pos1)==0) - { - is_mft = TRUE; - wellMFT++; + mft = d->getMFT(w)->pos; + if (mft == pos1) + { is_mft = TRUE; + wellMFT++; } - delete mft; } else unknown++; if (strcmp(wrd1,wrd2)==0 && strcmp(pos1,pos2)==0) { - well++; - is_hit=TRUE; //Es acierto - //Acierto para desconocidas - if (w==HASH_FAIL) unkHits++; - else if (numMaybe>1) - //Acierto para conocidas ambiguas - knownHitsAmb++; - //Acierto para conocidas no ambiguas - else knownHitsUnamb++; + well++; + is_hit=TRUE; //Es acierto + if ((long)w==HASH_FAIL) unkHits++; //Acierto para desconocidas + else if (numMaybe>1) + knownHitsAmb++; //Acierto para conocidas ambiguas + else knownHitsUnamb++; //Acierto para conocidas no ambiguas } if (report_type==3 || report_type==0) { - //Acumulamos por nivel de ambigedad - char level[4]; - if (w!=HASH_FAIL) sprintf(level,"%d",numMaybe); - else sprintf(level,"UNKOWN"); - addStatsToHash(&stat_Amb_Level,level,is_hit,is_mft); + //Acumulamos por nivel de ambigüedad + char 
level[4]; + if ((long)w!=HASH_FAIL) sprintf(level,"%d",numMaybe); + else sprintf(level,"UNKOWN"); + addStatsToHash(&stat_Amb_Level,level,is_hit,is_mft); } if (report_type==4 || report_type==0) { - //Acumulamos por clase de ambigedad - char *ambClass = d->getAmbiguityClass(w); - addStatsToHash(&stat_Class_Amb,ambClass,is_hit,is_mft); - delete ambClass; + //Acumulamos por clase de ambigüedad + std::string ambClass = d->getAmbiguityClass(w); + addStatsToHash(&stat_Class_Amb,ambClass,is_hit,is_mft); } if (report_type==5 || report_type==0) { - //Acumulamos por etiqueta - addStatsToHash(&stat_POS,pos2,is_hit,is_mft); + //Acumulamos por etiqueta + addStatsToHash(&stat_POS,pos2,is_hit,is_mft); } showProcessDone(totalWords,2000,FALSE,"words"); @@ -325,15 +301,15 @@ void eval::makeReport(dictionary *d,FILE *gold, FILE *pred) } - showProcessDone(totalWords,2000,TRUE,"words"); - printTaggingSumary(known,unknown,ambiguous,well,wellMFT); - if (report_type==2 || report_type==0) printKnownVsUnknown(ambiguous, unambiguous, unknown,unkHits,knownHitsAmb,knownHitsUnamb); - if (report_type==3 || report_type==0) printStatsByLevel(&stat_Amb_Level); - if (report_type==4 || report_type==0) printStatsByAmbiguityClass(&stat_Class_Amb); - if (report_type==5 || report_type==0) printStatsByPOS(&stat_POS); + showProcessDone(totalWords,2000,TRUE,"words"); + printTaggingSumary(known,unknown,ambiguous,well,wellMFT); + if (report_type==2 || report_type==0) printKnownVsUnknown(ambiguous, unambiguous, unknown,unkHits,knownHitsAmb,knownHitsUnamb); + if (report_type==3 || report_type==0) printStatsByLevel(&stat_Amb_Level); + if (report_type==4 || report_type==0) printStatsByAmbiguityClass(&stat_Class_Amb); + if (report_type==5 || report_type==0) printStatsByPOS(&stat_POS); - float porcentageAmbiguedad = (float)contAmbiguities/ (float) known; - printOverallAccuracy(unknown+known,well,wellMFT,porcentageAmbiguedad); + float porcentageAmbiguedad = (float)contAmbiguities/ (float) known; + 
printOverallAccuracy(unknown+known,well,wellMFT,porcentageAmbiguedad); } @@ -345,13 +321,13 @@ eval::eval(char *model, char *goldName, char *predName) numAmbLevel = 0; numAmbClass = 0; - hash_init(&stat_Amb_Level,10); - hash_init(&stat_Class_Amb,100); - hash_init(&stat_POS,100); - + stat_Amb_Level.hash_init(10); + stat_Class_Amb.hash_init(100); + stat_POS.hash_init(100); + strcpy(sModel, model); strcpy(sGold, goldName); - strcpy(sPred, predName); + strcpy(sPred, predName); } @@ -360,7 +336,6 @@ void eval::evalPutReportType(int report) report_type = report; } - void eval::evalRun() { fprintf(stderr,"* ========================= SVMTeval report ==============================\n"); @@ -374,11 +349,9 @@ void eval::evalRun() d = new dictionary(name); makeReport(d,gold,pred); - + delete d; } - - /************************************************************************/ int main(int argc, char *argv[]) @@ -386,12 +359,10 @@ int main(int argc, char *argv[]) int i = 0; int report_type; - verbose_svmtool = TRUE; - if (argc<4) { fprintf(stderr,"Waiting 3 or more parameters\n"); - printHelp(); + printHelp(argv[0]); exit(0); } if (argc>4) @@ -400,7 +371,7 @@ int main(int argc, char *argv[]) i=1; if (report_type>5 || report_type<0) { - printHelp(); + printHelp(argv[0]); exit(0); } } diff --git a/src/bin/SVMTlearn.cc b/src/bin/SVMTlearn.cc old mode 100644 new mode 100755 index adcb1aa..773c3b0 --- a/src/bin/SVMTlearn.cc +++ b/src/bin/SVMTlearn.cc @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -29,52 +29,49 @@ #include "er.h" #include "common.h" -extern int verbose_svmtool; +int verbose = FALSE; -void printHelp() +void printHelp(char *progname) { - fprintf(stderr,"\n\nSVMTool++ v 1.1.2 - SVMTLearner\n\n"); - fprintf(stderr,"\nUsage : SVMTlearn [options] "); + fprintf(stderr,"\n\nSVMTool++ v 1.1.6 - SVMTLearner\n\n"); + fprintf(stderr,"\nUsage : %s [options] ", progname); fprintf(stderr,"\noptions:\n"); fprintf(stderr,"\n\t-V or -v:\tverbose"); - fprintf(stderr,"\n\nExample: SVMTlearn -V config.svmt\n\n"); + fprintf(stderr,"\n\nExample: %s -V config.svmt\n\n", progname); } - int options(int argc,char *argv[]) { - if ( argc < 2 ) return -1; + if ( argc < 2 ) return -1; - for (int i=1;i -#include -#include -#include -#include "common.h" - -int verbose_svmtool = FALSE; - -/**************************************************/ - -/* - * FILE *openFile(const char *name, const char mode[]) - * Abre un fichero con el nombre y en el modo - * (r lectura, w escritura, a actualizacin ...). 
- * Devuelve el puntero al fichero - * En caso de no poder abrir un fichero, termina la ejecucion - */ -FILE *openFile(const char *name, const char mode[]) -{ - FILE *f; - if ((f = fopen(name, mode))== NULL) - { - fprintf(stderr, "Error opening file: %s\n",name); - exit(0); - } - return f; -} - - -/**************************************************/ - -void generateFileName(const char *name, const char *added,int numModel, int direction, int what, const char *type, char *out) -{ - strcpy(out,name); - if (strcmp(added,"")!=0 && added!=NULL) - { - sprintf(out,"%s.",out); - for (int i=0; i=0) sprintf(out,"%s.M%d",out,numModel); - if (direction==LEFT_TO_RIGHT) sprintf(out,"%s.LR",out); - else if (direction==RIGHT_TO_LEFT) sprintf(out,"%s.RL",out); - if (type!=NULL) sprintf(out,"%s.%s",out,type); -} - - -/**************************************************/ - -void showProcess(int num,int isEnd) -{ - if (isEnd) { fprintf(stderr,".%d sentences [DONE]\n\n",num); return; } - else if (num%100==0) fprintf(stderr,"%d",num); - else if (num%10==0) fprintf(stderr,"."); -} - - -/**************************************************/ - -void showProcessDone(int num,int freq, int isEnd, const char *what) -{ - if (isEnd) { fprintf(stderr,".%d %s [DONE]\n",num,what); return; } - else if (num%freq==0) fprintf(stderr,"."); -} - - -/**************************************************/ - -/* - * int goToWord(FILE *f, int offset) - * Lee lineas del canal o fichero - * Retorna -1 si encuentra eof - * retorna el numero de lineas leidas si todo va bien - */ -int goToWord(FILE *f, int offset) -{ - int cont=0; - - while (!feof(f) && cont, el String leido sera devuelto como el - * parametro de salida . 
Para leer el String se leera hasta encontrar - * el o el caracter - * Retorna 0 si encuentra - * retorna -1 si eof - * retorn 1 si todo va bien y encuentra - */ -int readTo(FILE *f, char endChar, char endLine, char *out) -{ - strcpy(out,""); - char c = endChar+1; - while (!feof(f) && c!=endChar && (endLine==0 || c!=endLine)) - { - c=fgetc(f); - if (c!=endChar && c!=endLine) sprintf(out,"%s%c",out,c); - } - if (feof(f)) return -1; - if (c==endLine) return 0; - return 1; -} - - -/*******************************************************/ - -void qsort(int a[], int lo, int hi) -{ - int h, l, p, t; - - if (lo < hi) - { - l = lo; - h = hi; - p = a[hi]; - - do - { - while ((l < h) && (a[l] <= p)) - l = l+1; - while ((h > l) && (a[h] >= p)) - h = h-1; - if (l < h) - { - t = a[l]; - a[l] = a[h]; - a[h] = t; - } - } while (l < h); - - t = a[l]; - a[l] = a[hi]; - a[hi] = t; - - qsort(a, lo, l-1); - qsort(a, l+1, hi); - } // if -} - - -/**************************************************/ - -void showTime(const char *what, double real, double utime, double stime) -{ - char message[200]=""; - sprintf(message,"%s: [ Real Time = %5.3lf secs.( %5.3lf usr + %5.3lf sys = %5.3lf CPU Time) ]\n",what,real,utime,stime,utime+stime); - fprintf(stderr,"%s",message); -} - - -/**************************************************/ - -int buscarMenorEnString(char *szIn,char *szMenor,int *iMenor) -{ - char szString[10],szTemp[strlen(szIn)+1]; - int iString; - - if (strcmp(szIn,"")==0 || szIn==NULL) return 1; - - strcpy(szTemp,szIn); - if (*iMenor==-1) - sscanf(szIn,"%s%d",szMenor,iMenor); - else - { - sscanf(szIn,"%s%d",szString,&iString); - if (strcmp(szString,szMenor)<0) - { - strcpy(szMenor,szString); - *iMenor = iString; - } - } - - int cont=0; - int i; - for (i=0; cont<2 && i(szInicial+strlen(szInicial))) return depth; - if (depth==0) strcpy(szOut,""); - - buscarMenorEnString(szIn,szMenor,&iMenor); - sprintf(szTempMenor,"%s %d",szMenor,iMenor); - p = strstr(szIn,szTempMenor); - - 
strcpy(szTemp,""); - // Copiamos string szIn sin pareja menor - while (i(p+strlen(szTempMenor))) - { - sprintf(szTemp,"%s%c",szTemp,szIn[i]); - } - i++; - } - - if (strlen(szOut)==0) sprintf(szOut,"%s %d",szMenor,iMenor); - else sprintf(szOut,"%s %s %d",szOut,szMenor,iMenor); - - return ordenarStringPorParejas(szTemp,szOut,depth+1,szInicial); -} - - -/**************************************************/ - -int obtainMark(FILE *channel,char *mark, int es_primera) -{ - int ret; - strcpy(mark,""); - while (strlen(mark)==0) ret = readTo(channel,'(','\n',mark); - - if (ret==-1) return -1; - else return ret; -} - - -/**************************************************/ - -int obtainAtrInt(FILE *channel,int *endAtr) -{ - int i=0; - char c=' ',num[5]=""; - - while ( (!feof(channel)) && (c!='(') && (c!=',') && (c!=')') ) - { - c=fgetc(channel); - if ((c!='(') && (c!=')')) num[i]=c; - i++; - } - if (c==')') *endAtr=1; - num[i]='\0'; - return atoi(num); -} - - -/**************************************************/ - -void destroyFeatureList(simpleList *fl,int nf) -{ - nodo_feature_list *data = NULL; - - fl->setFirst(); - if ( nf >= 1 ) //Si tiene mas de un maybe es ambigua - { - int ret = 1; - while (ret>=0) - { - data = (nodo_feature_list *) fl->getIndex(); - - data->l.setFirst(); - if (data->l.numElements() >= 1) - { - int ret2 = 1; - while (ret2>=0) - { - int *pInt = (int *) data->l.getIndex(); - delete pInt; - ret2 = data->l.next(); - } - data->l.setFirst(); - } - delete data; - ret=fl->next(); - } - fl->setFirst(); - } /* if */ -} - - -/**************************************************/ - -void createFeatureList(char *name,simpleList *featureList) -{ - int *i,endAtr,cont=0; - char c; - int ret = 1; - //char temp[100]; - nodo_feature_list *data; - - FILE *f; - if ((f = fopen(name, "rt"))== NULL) - { - fprintf(stderr, "Error opening file %s!!",name); - exit(0); - } - - //Insert feature Swn - data = new nodo_feature_list; - strcpy(data->mark,"Swn"); - data->n = 0; - 
featureList->add(data); - - char temp[10]; - ret = obtainMark(f,temp,TRUE); - while (ret!=-1) - { - data = new nodo_feature_list; - strcpy(data->mark,temp); - - endAtr=0; - cont=0; - - while (endAtr==0 && ret!=0) - { - i = new int; - *i = obtainAtrInt(f,&endAtr); - data->l.add(i); - cont++; - } - data->n = cont; - featureList->add(data); - strcpy(temp,""); - ret = obtainMark(f,temp,FALSE); - } - fclose(f); -} - - -/**************************************************/ - -void removeFiles(char *path, int type,int numModel, int direction, int verbose) -{ - char remove[200]; - switch (type) - { - case RM_TEMP_FILES: - if (verbose==TRUE) fprintf(stderr,"\nErasing temporal files.",numModel); - sprintf(remove,"rm -f %s.M%d*.SVM",path,numModel); - system(remove); - sprintf(remove,"rm -f %s*M%d*.POS",path,numModel); - system(remove); - sprintf(remove,"rm -f %s*M%d*.SAMPLES",path,numModel); - system(remove); - sprintf(remove,"rm -f %s*M%d*.MAP",path,numModel); - system(remove); - sprintf(remove,"rm -f %s*DICT.*",path); - system(remove); - break; - case RM_MODEL_FILES: - if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) - { - if (verbose==TRUE) fprintf(stderr,"\nErasing old files for MODEL %d in LEFT TO RIGHT sense.",numModel); - sprintf(remove,"rm -f %s*M%d.LR.*",path,numModel); - system(remove); - } - if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) - { - if (verbose==TRUE) - fprintf(stderr,"\nErasing old files for MODEL %d in RIGHT TO LEFT sense.",numModel); - sprintf(remove,"rm -f %s*M%d.RL.*",path,numModel); - system(remove); - } - sprintf(remove,"rm -f %s*A%d.*",path,numModel); - system(remove); - break; - } -} +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "common.h" + +using namespace std; + +/**************************************************/ + +/* + * FILE *openFile(char *name, char mode[]) + * Abre un fichero con el nombre y en el modo + * (r lectura, w escritura, a actualización ...). + * Devuelve el puntero al fichero + * En caso de no poder abrir un fichero, termina la ejecucion + */ +FILE *openFile(const std::string &name, const char mode[]) +{ + FILE *f; + if ((f = fopen(name.c_str(), mode))== NULL) + { + fprintf(stderr, "Error opening file: %s\n",name.c_str()); + exit(0); + } + return f; +} + +/**************************************************/ + +void generateFileName(const std::string& name, const std::string& added, int numModel, int direction, int what, const std::string& type, std::string& out) +{ + out = name; + if (!added.empty()) + { + out += "."; + for (std::string::size_type i=0; i=0) out += ".M" + numModel; + if (direction==LEFT_TO_RIGHT) out += ".LR"; + else if (direction==RIGHT_TO_LEFT) out += ".RL"; + if (!type.empty()) out += "." 
+ type; +} + +/**************************************************/ + +void showProcess(int num,int isEnd) +{ + if (isEnd) { fprintf(stderr,".%d sentences [DONE]\n\n",num); return; } + else if (num%100==0) fprintf(stderr,"%d",num); + else if (num%10==0) fprintf(stderr,"."); +} + +/**************************************************/ + +void showProcessDone(int num, int freq, int isEnd, const std::string& what) +{ + if (isEnd) { std::cerr <<"."< lineas del canal o fichero + * Retorna -1 si encuentra eof + * retorna el numero de lineas leidas si todo va bien + */ +int goToWord(FILE *f, int offset) +{ + int cont=0; + + while (!feof(f) && cont, el String leido sera devuelto como el + * parametro de salida . Para leer el String se leera hasta encontrar + * el o el caracter + * Retorna 0 si encuentra + * retorna -1 si eof + * retorn 1 si todo va bien y encuentra + */ +int readTo(FILE *f, char endChar, char endLine, std::string &out) +{ + out = ""; + char c = endChar+1; + while (!feof(f) && c!=endChar && (endLine==0 || c!=endLine)) + { + c=fgetc(f); + if (c!=endChar && c!=endLine) out += c; + } + if (feof(f)) return -1; + if (c==endLine) return 0; + return 1; +} + +/*******************************************************/ + +void qsort(int a[], int lo, int hi) { + int h, l, p, t; + + if (lo < hi) { + l = lo; + h = hi; + p = a[hi]; + + do { + while ((l < h) && (a[l] <= p)) + l = l+1; + while ((h > l) && (a[h] >= p)) + h = h-1; + if (l < h) { + t = a[l]; + a[l] = a[h]; + a[h] = t; + } + } while (l < h); + + t = a[l]; + a[l] = a[hi]; + a[hi] = t; + + qsort(a, lo, l-1); + qsort(a, l+1, hi); + } // if + } + +/**************************************************/ + +void showTime(const std::string& what, double real, double utime, double stime) +{ +// char message[200]=""; + std::cerr << what << ": [ Real Time = %5.3lf "<< real << " secs.( %5.3lf "< p_szTemp(strlen(szIn)+1); + char *szTemp = &p_szTemp[0]; + + int iString; + + if (strcmp(szIn,"")==0 || szIn==NULL) return 1; + + 
strcpy(szTemp,szIn); + if (*iMenor==-1) + sscanf(szIn,"%s%d",szMenor,iMenor); + else + { + sscanf(szIn,"%s%d",szString,&iString); + if (strcmp(szString,szMenor)<0) + { + strcpy(szMenor,szString); + *iMenor = iString; + } + } + + int cont=0; + unsigned int i = 0; + for (; cont<2 && i(szInicial+strlen(szInicial))) return depth; + if (depth==0) strcpy(szOut,""); + + buscarMenorEnString(szIn,szMenor,&iMenor); + sprintf(szTempMenor,"%s %d",szMenor,iMenor); + p = strstr(szIn,szTempMenor); + + // Copiamos string szIn sin pareja menor + unsigned int i = 0; + while (i(p+strlen(szTempMenor))) + { + szTemp += szIn[i]; + } + i++; + } + + if (strlen(szOut)==0) sprintf(szOut,"%s %d",szMenor,iMenor); + else sprintf(szOut,"%s %s %d",szOut,szMenor,iMenor); + + return ordenarStringPorParejas(szTemp.c_str(),szOut,depth+1,szInicial); +} + +/**************************************************/ + +int obtainMark(FILE *channel,std::string& mark) +{ + int ret; + mark = ""; + while (mark.empty()) ret = readTo(channel,'(','\n',mark); + + return ret; +} + +/**************************************************/ + +int obtainAtrInt(FILE *channel,int *endAtr) +{ + int i=0; + char c=' ',num[5]=""; + + while ( (!feof(channel)) && (c!='(') && (c!=',') && (c!=')') ) + { + c=fgetc(channel); + if ((c!='(') && (c!=')')) num[i]=c; + i++; + } + if (c==')') *endAtr=1; + num[i]='\0'; + return atoi(num); +} + +/**************************************************/ + + +void destroyFeatureList(simpleList *fl) +{ + nodo_feature_list *data = 0; + + fl->setFirst(); + for(int i = 0; i < fl->numElements(); i++, fl->next()) { + data = *fl->getIndex(); + data->l.setFirst(); + for(int j = 0; j < data->l.numElements(); j++, data->l.next()) { + delete *(data->l.getIndex()); + } + delete data; + } +} + +/**************************************************/ + +void createFeatureList(const std::string &name,simpleList *featureList) +{ + int *i,endAtr; +// char c; + int ret = 1; + //char temp[100]; + nodo_feature_list *data; + 
+ FILE *f; + if ((f = fopen(name.c_str(), "rt"))== NULL) + { + fprintf(stderr, "Error opening file %s!!",name.c_str()); + exit(0); + } + + //Insert feature Swn + data = new nodo_feature_list; + data->mark = "Swn"; + featureList->add(data); + + std::string temp; + ret = obtainMark(f,temp); + while (ret!=-1) + { + data = new nodo_feature_list; + data->mark = temp; + + endAtr=0; + + while (endAtr==0 && ret!=0) + { + i = new int; + *i = obtainAtrInt(f,&endAtr); + data->l.add(i); + } + featureList->add(data); + temp.clear(); + ret = obtainMark(f,temp); + } + fclose(f); + +} + +/**************************************************/ + +void removeFiles(const std::string &path, int type,int numModel, int direction, int verbose) +{ + char szRemove[200]; + switch (type) + { + case RM_TEMP_FILES: + if (verbose==TRUE) fprintf(stderr,"\nErasing temporal files."); + /* + sprintf(remove,"rm -f %s.M%d*.SVM",path,numModel); + system(remove); + sprintf(remove,"rm -f %s*M%d*.POS",path,numModel); + system(remove); + sprintf(remove,"rm -f %s*M%d*.SAMPLES",path,numModel); + system(remove); + sprintf(remove,"rm -f %s*M%d*.MAP",path,numModel); + system(remove); + sprintf(remove,"rm -f %s*DICT.*",path); + system(remove); + */ + sprintf(szRemove,"%s.M%d*.SVM",path.c_str(),numModel); + remove(szRemove); + sprintf(szRemove,"%s*M%d*.POS",path.c_str(),numModel); + remove(szRemove); + sprintf(szRemove,"%s*M%d*.SAMPLES",path.c_str(),numModel); + remove(szRemove); + sprintf(szRemove,"%s*M%d*.MAP",path.c_str(),numModel); + remove(szRemove); + sprintf(szRemove,"%s*DICT.*",path.c_str()); + remove(szRemove); + break; + case RM_MODEL_FILES: + if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) + { + if (verbose==TRUE) fprintf(stderr,"\nErasing old files for MODEL %d in LEFT TO RIGHT sense.",numModel); + //sprintf(szRemove,"rm -f %s*M%d.LR.*",path,numModel); + //system(szRemove); + sprintf(szRemove,"%s*M%d.LR.*",path.c_str(),numModel); + remove(szRemove); + } + if (direction==RIGHT_TO_LEFT || 
direction==LR_AND_RL) + { + if (verbose==TRUE) + fprintf(stderr,"\nErasing old files for MODEL %d in RIGHT TO LEFT sense.",numModel); + //sprintf(szRemove,"rm -f %s*M%d.RL.*",path,numModel); + //system(szRemove); + sprintf(szRemove,"%s*M%d.RL.*",path.c_str(),numModel); + remove(szRemove); + } + //sprintf(szRemove,"rm -f %s*A%d.*",path,numModel); + //system(szRemove); + sprintf(szRemove,"%s*A%d.*",path.c_str(),numModel); + remove(szRemove); + break; + } +} + +void Tokenize(const string& str, vector& tokens, const string& delimiters = " ") +{ + // Skip delimiters at beginning. + string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (string::npos != pos || string::npos != lastPos) + { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } +} + diff --git a/src/dict.cc b/src/dict.cc old mode 100644 new mode 100755 dissimilarity index 81% index e37e731..e77d00f --- a/src/dict.cc +++ b/src/dict.cc @@ -1,740 +1,688 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include "hash.h" -#include "list.h" -#include "dict.h" -#include "swindow.h" -#include "common.h" -#include "er.h" - -extern int verbose_svmtool; - -/**************************************************/ - -char *dictionary::getMFT(int w) -{ - if (w==HASH_FAIL) return NULL; - - int max=0,ret = 1; - char *mft = new char[TAM_POS]; - simpleList *l = this->getElementMaybe(w); - infoDict *ptr; - - l->setFirst(); - while (ret>=0) - { - ptr = (infoDict *)l->getIndex(); - if (maxnum) - { - strcpy(mft,ptr->txt); - max = ptr->num; - } - ret = l->next(); - } - l->setFirst(); - return mft; -} - - -/**************************************************/ - -char *dictionary::getAmbiguityClass(int w) -{ - char *amb = new char[200]; - if (w==HASH_FAIL) - { - sprintf(amb,"UNKNOWN"); - return amb; - } - - int ret = 1; - strcpy(amb,""); - simpleList *l = this->getElementMaybe(w); - int numMaybe = this->getElementNumMaybe(w); - infoDict *ptr; - - l->setFirst(); - while (ret>=0) - { - numMaybe--; - ptr = (infoDict *)l->getIndex(); - // fprintf(stderr," %s %d",ptr->txt,ptr->num); - if (numMaybe>0) sprintf(amb,"%s%s_",amb,ptr->txt); - else sprintf(amb,"%s%s",amb,ptr->txt); - ret = l->next(); - } - l->setFirst(); - return amb; -} - - -/**************************************************/ - -void dictionary::dictIncInfo(dataDict *elem, char *pos) -{ - int ret=1; - infoDict *pInfoDict; - - elem->numWrd++; - while (ret>=0) - { - pInfoDict = (infoDict *) elem->maybe.getIndex(); - if (strcmp(pInfoDict->txt,pos)==0) - { - pInfoDict->num++; - elem->maybe.setFirst(); - return; - } - ret=elem->maybe.next(); - } - pInfoDict = new infoDict; - strcpy(pInfoDict->txt,pos); - pInfoDict->num=1; - elem->maybe.add(pInfoDict); - elem->numMaybe++; - 
elem->maybe.setFirst(); -} - - -/**************************************************/ - -void dictionary::dictWrite(char *outName) -{ - int ret=0; - infoDict *data; - dataDict *aux; - int cont=0,contWords=0; - char stringPOS[1000]; - - FILE *f = openFile(outName,"w"); - - hash_t *tptr = &d; - - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - for (i=0; inext; - - aux = (dataDict *) tmp->data; - //fprintf(stderr,"\n%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe); - fprintf(f,"%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe); - - cont++; - contWords = aux->numWrd+contWords; - ret = 1; - strcpy(stringPOS,""); - while (aux->numMaybe>0 && ret>=0) - { - data = (infoDict *) aux->maybe.getIndex(); - //fprintf(stderr," [ %s %d ]",data->txt,data->num); - - if (strlen(stringPOS)==0) sprintf(stringPOS,"%s %d",data->txt,data->num); - else sprintf(stringPOS,"%s %s %d",stringPOS,data->txt,data->num); - - ret=aux->maybe.next(); - } - - char *szOut = new char[strlen(stringPOS)+1]; - ordenarStringPorParejas(stringPOS, szOut, 0, stringPOS); - fprintf(f," %s\n",szOut); - - delete szOut; - } /* while */ - } /* for */ - - fclose(f); - return; -} - - -/**************************************************/ - -void dictionary::dictCreate(FILE *f,int limitInf,int limitSup) -{ - int retW=0,retP=0,contWords=0,cont=0,contWordsAdded=0; - infoDict *data; - dataDict *aux,*aux2; - nodo *elem; - char wrd[200],pos[10]; - int no_chunk = FALSE; - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nCreating Dictionary"); - - if (limitInf == 0 && limitSup == 0) no_chunk = TRUE; - - while (retP>=0 && retW>=0) - { - if ( verbose_svmtool == TRUE) showProcessDone(contWordsAdded, 1000, FALSE,""); - retW = readString(f, wrd); - char *real = new char [strlen(wrd)+1]; - strcpy(real,wrd); - retP = readString(f, pos); - - if (retW>=0 && retP>=0) - { - int erRet=erLookRegExp(wrd); - switch (erRet) - { - case CARD: strcpy(wrd,"@CARD"); break; - case 
CARDSEPS: strcpy(wrd,"@CARDSEPS"); break; - case CARDPUNCT: strcpy(wrd,"@CARDPUNCT"); break; - case CARDSUFFIX: strcpy(wrd,"@CARDSUFFIX"); break; - } - - int is_valid_for_limit_inf = ( (contWords < limitInf) || contWords == 0 ); - int is_valid_for_limit_sup = (contWords >= limitSup ); - - if ( no_chunk == TRUE || is_valid_for_limit_inf || is_valid_for_limit_sup ) - { - if ((uintptr_t)(aux=(dataDict *)hash_lookup(&d,wrd)) == HASH_FAIL) - { - aux= new dataDict; - strcpy(aux->wrd,wrd); - aux->numMaybe = 1; - aux->numWrd = 1; - data = new infoDict; - strcpy(data->txt,pos); - data->num=1; - aux->maybe.add(data); - hash_insert(&d,aux->wrd,(uintptr_t) aux); - cont++; - } - else dictIncInfo(aux,pos); - contWordsAdded++; - - if (strcmp(wrd,"@CARD")==0 || strcmp(wrd,"@CARDPUNCT")==0 - || strcmp(wrd,"@CARDSEPS")==0 || strcmp(wrd,"@CARDSUFFIX")==0) - { - if ((uintptr_t)(aux2=(dataDict *)hash_lookup(&d,real)) == HASH_FAIL) - { - aux2 = new dataDict; - strcpy(aux2->wrd,real); - aux2->numMaybe = 1; - aux2->numWrd = 1; - data = new infoDict; - strcpy(data->txt,pos); - data->num = 1; - aux2->maybe.add(data); - hash_insert(&d,aux2->wrd,(uintptr_t) aux2); - cont++; - } - else dictIncInfo(aux2,pos); - } - } - } - contWords++; - delete real; - } - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"[ %d words ]",cont); -} - - -/**************************************************/ - -void dictionary::dictRepairFromFile(char *fileName) -{ - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nReparing Dictionary with file < %s >",fileName); - FILE *f = openFile(fileName,"r"); - - char wrd[250],pos[10]; - int numWrd,numMaybe,numWrdxPOS; - dataDict *aux; - - // Bucle para leer lista de palabras - while (!feof(f)) - { - fscanf(f,"%s %d %d",wrd,&numWrd,&numMaybe); - int w = hash_lookup(&d,wrd); - if (w!=HASH_FAIL) - { - aux = new dataDict; - strcpy(aux->wrd,wrd); - aux->numWrd = getElementNumWord(w); - aux->numMaybe = 0; - - simpleList *l = getElementMaybe(w); - - for (int i=0;i=0) - { - infoDict 
*ptr = (infoDict *)l->getIndex(); - if (strcmp(pos,ptr->txt)==0) - { - //Copiamos elemento a aadir - infoDict *tmpInfoDict = new infoDict; - strcpy(tmpInfoDict->txt,ptr->txt); - tmpInfoDict->num = ptr->num; - - aux->maybe.add(tmpInfoDict); - aux->numMaybe++; - ret = -1; - //Borrar substituido - delete ptr; - } - else ret = l->next(); - - } - l->setFirst(); - } - - delete (dataDict *) hash_delete (&d,wrd); - hash_insert(&d,aux->wrd,(uintptr_t) aux); - } - } - fclose(f); -} - - -/**************************************************/ - -void dictionary::dictRepairHeuristic(float dratio) -{ - hash_t *tptr = &d; - hash_node_t *node, *last; - int i; - - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - - int ret=0; - dataDict *dd = (dataDict *)last->data; - - simpleList *l = &dd->maybe; - l->setFirst(); - int iNumWrdsAfterDelete = dd->numWrd; - while (ret>=0) - { - infoDict *ptr = (infoDict *)l->getIndex(); - - float fRange = (float) ptr->num / (float) dd->numWrd ; - - if (fRange < dratio) - { - dd->numMaybe--; - iNumWrdsAfterDelete = iNumWrdsAfterDelete - ptr->num; - //Eliminar pos - l->delIndex(); - } - ret = l->next(); - } - dd->numWrd = iNumWrdsAfterDelete; - l->setFirst(); - } - } -} - - -/**************************************************/ - -int dictionary::readInt(FILE *in) -{ - int i=0; - char value[10]; - char c=' '; - - strcpy(value,""); - - while ((c==' ') && (!feof(in))) c=fgetc(in); - while ((i<10) && (c!=' ') && (c!='\n') && (!feof(in))) - { - sprintf(value,"%s%c",value,c); - c=fgetc(in); i++; - } - return atoi(value); -} - - -/**************************************************/ - -infoDict *dictionary::readData(FILE *in) -{ - infoDict *data = new infoDict; - char c=fgetc(in); - int i = 0; - - strcpy(data->txt,""); - - while ( (itxt,"%s%c",data->txt,c); c=fgetc(in); i++; - } - data->num = readInt(in); - return data; -} - - -/**************************************************/ - -void 
dictionary::dictAddBackup(char *name) -{ - FILE *f = openFile(name,"r"); - - char wrd[250],pos[10]; - int ret,i; - dataDict *aux; - infoDict *data; - - // Bucle para leer lista de palabras - while (!feof(f)) - { - data = readData(f); - i = readInt(f); - uintptr_t w = hash_lookup(&d,wrd); - if (w==HASH_FAIL) - { - aux = new dataDict; - strcpy(aux->wrd,data->txt); - aux->numWrd = 0; - aux->numMaybe = 0; - } - else aux = (dataDict *) w; - - aux->numWrd += data->num; - delete data; - while (i>0) - { - data = readData(f); - ret=1; - //Buscamos si ja existe en la lista. - for (int j=aux->numMaybe;ret>=0 && j>0; j--) - { - infoDict *element = (infoDict *)aux->maybe.getIndex(); - if (strcmp(data->txt,element->txt)==0) - { - ret = -1; - element->num += data->num; - } - else ret = aux->maybe.next(); - } - //Si no encontrado lo aadimos a la lista - if (ret!=-1) - { - aux->maybe.add(data); - aux->numMaybe++; - } - else delete data; - i--; - } - if (w==HASH_FAIL) hash_insert(&d,aux->wrd,(uintptr_t) aux); - - } //End while not eof - fclose(f); -} - - -/**************************************************/ - -void dictionary::dictLoad(FILE *in) -{ - char c='\0'; - char wrd[25]=""; - int i=0,number; - dataDict *aux; - infoDict *data; - - while (!feof(in)) - { - data = readData(in); - i = readInt(in); - aux = new dataDict; - - strcpy(aux->wrd,data->txt); - - aux->numWrd = data->num; - aux->numMaybe = i; - delete data; - - //Si leemos una lnea con numMaybe = 0, es decir, sin - //ninguna etiqueta no la cargaremos. 
- if ( aux->numMaybe > 0) - { - while (i>0) - { - data = readData(in); - aux->maybe.add(data); - - i--; - } - hash_insert(&d,aux->wrd,(uintptr_t) aux); - } - else delete aux; - } -} - - -/**************************************************/ - -int dictionary::getElement(char *key) -{ - return hash_lookup(&d,key); -} - - -/**************************************************/ - -char *dictionary::getElementWord(uintptr_t ptr) -{ - dataDict *aux = (dataDict *) ptr; - return aux->wrd; -} - - -/**************************************************/ - -int dictionary::getElementNumWord(uintptr_t ptr) -{ - dataDict *aux = (dataDict *) ptr; - return aux->numWrd; -} - - -/**************************************************/ - -int dictionary::getElementNumMaybe(uintptr_t ptr) -{ - dataDict *aux = (dataDict *) ptr; - return aux->numMaybe; -} - - -/**************************************************/ - -simpleList *dictionary::getElementMaybe(uintptr_t ptr) -{ - dataDict *aux = (dataDict *) ptr; - return &aux->maybe; -} - - -/**************************************************/ - -dictionary::dictionary(char *name,char *backup) -{ - FILE *in = openFile(name,"r"); - hash_init(&d,1000); - dictLoad(in); - fclose(in); - dictAddBackup(backup); -} - - -/**************************************************/ - -dictionary::dictionary(char *name) -{ - FILE *in = openFile(name,"r"); - hash_init(&d,1000); - dictLoad(in); - fclose(in); -} - - -/**************************************************/ - -dictionary::dictionary(char *name,int limInf, int limSup) -{ - FILE *in = openFile(name,"r"); - char str[200]; - hash_init(&d,1000); - dictCreate(in,limInf,limSup); - fclose(in); -} - - -/**************************************************/ - -void dictionary::dictCleanListInfoDict(simpleList *l, int num) -{ - infoDict *data = NULL; - - l->setFirst(); - if ( num >= 1 ) //Si tiene mas de un maybe es ambigua - { - int ret = 1; - while (ret>=0) - { - data = (infoDict *) l->getIndex(); - delete data; - 
ret=l->next(); - } - l->setFirst(); - } /* if */ -} - - -/**************************************************/ - -dictionary::~dictionary() -{ - infoDict *data; - dataDict *aux; - - //Recorrer cada entrada del diccionario eliminando el contenido de las listas - hash_t *tptr = &d; - - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - for (i=0; inext; - - aux = (dataDict *) tmp->data; - - dictCleanListInfoDict(&aux->maybe,aux->numMaybe); - delete aux; - aux = NULL; - } /* while */ - } /* for */ - - //Destruir hashing - hash_destroy(&d); -} - - -/**************************************************/ - -hash_t *dictionary::dictFindAmbP(int *numPOS) -{ - int ret=0; - infoDict *data; - dataDict *aux; - - hash_t *ambp = new hash_t; - hash_t *tptr = &d; - hash_init(ambp,30); - - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - *numPOS = 0; - - for (i=0; inext; - - aux = (dataDict *) tmp->data; - aux->maybe.setFirst(); - if (aux->numMaybe>1) //Si tiene mas de un maybe es ambigua - { - ret = 1; - while (ret>=0) - { - data = (infoDict *) aux->maybe.getIndex(); - infoDict * tmp = new infoDict; - strcpy(tmp->txt,data->txt); - tmp->num = data->num; - hash_insert(ambp,tmp->txt,(uintptr_t) tmp); - - *numPOS++; - ret=aux->maybe.next(); - } - aux->maybe.setFirst(); - } /* if */ - } /* while */ - } /* for */ - return ambp; -} - - -/**************************************************/ - -hash_t *dictionary::dictFindUnkP(int *numPOS) -{ - int ret=0; - infoDict *data; - dataDict *aux; - - hash_t *unkp = new hash_t; - hash_t *tptr = &d; - hash_init(unkp,30); - - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - *numPOS = 0; - - for (i=0; inext; - - aux = (dataDict *) tmp->data; - aux->maybe.setFirst(); - if (aux->numWrd==1) //Si solo aparece una vez desconocida - { - ret = 1; - 
while (ret>=0) - { - data = (infoDict *) aux->maybe.getIndex(); - infoDict * tmp = new infoDict; - strcpy(tmp->txt,data->txt); - tmp->num = data->num; - hash_insert(unkp,tmp->txt,(uintptr_t) tmp); - *numPOS++; - ret=aux->maybe.next(); - } - aux->maybe.setFirst(); - } /* if */ - } /* while */ - } /* for */ - return unkp; -} +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "dict.h" + +#include "common.h" +#include "er.h" +#include "hash.h" +#include "list.h" +#include "swindow.h" + +#include +#include +#include +#include +using namespace std; + +extern int verbose; + +/**************************************************/ + +infoDict* dictionary::getMFT(dataDict* w) +{ + if ((long)w==HASH_FAIL) return NULL; + + int max=0; + bool ret = true; + simpleList* l = &this->getElementMaybe(w); + infoDict *ptr; + + l->setFirst(); + while (ret) + { + ptr = *l->getIndex(); + if (maxnum) + { + max = ptr->num; + } + ret = l->next(); + } + l->setFirst(); + return ptr; +} + +/**************************************************/ + +std::string dictionary::getAmbiguityClass(dataDict* w) +{ + char *amb = new char[200]; + if ((long)w==HASH_FAIL) + { + sprintf(amb,"UNKNOWN"); + return amb; + } + + bool ret = true; + 
strcpy(amb,""); + simpleList *l = &this->getElementMaybe(w); + int numMaybe = this->getElementNumMaybe(w); + infoDict *ptr; + + l->setFirst(); + while (ret) + { + numMaybe--; + ptr = *l->getIndex(); + // fprintf(stderr," %s %d",ptr->pos,ptr->num); + if (numMaybe>0) sprintf(amb,"%s%s_",amb,ptr->pos.c_str()); + else sprintf(amb,"%s%s",amb,ptr->pos.c_str()); + ret = l->next(); + } + l->setFirst(); + return amb; +} + +/**************************************************/ + +void dictionary::dictIncInfo(dataDict* elem, const std::string& pos) +{ + bool ret=true; + infoDict *pInfoDict; + + elem->numWrd++; + while (ret) + { + pInfoDict = *elem->maybe.getIndex(); + if (pInfoDict->pos == pos) + { + pInfoDict->num++; + elem->maybe.setFirst(); + return; + } + ret=elem->maybe.next(); + } + pInfoDict = new infoDict; + pInfoDict->pos = pos; + pInfoDict->num=1; + elem->maybe.add(pInfoDict); + elem->numMaybe++; + elem->maybe.setFirst(); +} + +/**************************************************/ + +void dictionary::dictWrite(const std::string& outName) +{ + infoDict *data; +// dataDict *aux; + int cont=0,contWords=0; + char stringPOS[1000]; + + FILE *f = openFile(outName.c_str(),"w"); + +// hash_node_t *old_hash, *tmp; +// int old_size, h, i; + + for (hash_t::iterator it = d.begin(); it != d.end(); it++) + { + dataDict *aux = (dataDict *) ((*it).second); + //fprintf(stderr,"\n%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe); + fprintf(f,"%s %d %d",aux->wrd.c_str(),aux->numWrd,aux->numMaybe); + + cont++; + contWords = aux->numWrd+contWords; + bool ret=true; + strcpy(stringPOS,""); + while (aux->numMaybe>0 && ret) + { + data = *aux->maybe.getIndex(); + //fprintf(stderr," [ %s %d ]",data->pos,data->num); + + if (strlen(stringPOS)==0) sprintf(stringPOS,"%s %d",data->pos.c_str(),data->num); + else sprintf(stringPOS,"%s %s %d",stringPOS,data->pos.c_str(),data->num); + + ret=aux->maybe.next(); + } + + char *szOut = new char[strlen(stringPOS)+1]; + szOut[strlen(stringPOS)] = '\0'; + 
ordenarStringPorParejas(stringPOS, szOut, 0, stringPOS); + fprintf(f," %s\n",szOut); + + delete[] szOut; + } /* for */ + + fclose(f); + return; +} + +/**************************************************/ + +void dictionary::dictCreate(FILE *f,int limitInf,int limitSup) +{ + int retW=0,retP=0,contWords=0,cont=0,contWordsAdded=0; + infoDict *data; + dataDict *aux,*aux2; +// nodo *elem; + std::string wrd,pos; + int no_chunk = FALSE; + + if ( verbose == TRUE ) fprintf(stderr,"\nCreating Dictionary"); + + if (limitInf == 0 && limitSup == 0) no_chunk = TRUE; + + while (retP>=0 && retW>=0) + { + if ( verbose == TRUE) showProcessDone(contWordsAdded, 1000, FALSE,""); + retW = readString(f, wrd); + std::string real(wrd); + retP = readString(f, pos); + + //cout << retW << "_" << retP << " :: " << wrd << "_" << pos << " :: (" << real << ")\n"; + + if (retW>=0 && retP>=0) + { + int erRet=erLookRegExp(wrd); + switch (erRet) + { + case CARD: wrd = "@CARD"; break; + case CARDSEPS: wrd = "@CARDSEPS"; break; + case CARDPUNCT: wrd = "@CARDPUNCT"; break; + case CARDSUFFIX: wrd = "@CARDSUFFIX"; break; + } + + int is_valid_for_limit_inf = ( (contWords < limitInf) || contWords == 0 ); + int is_valid_for_limit_sup = (contWords >= limitSup ); + + if ( no_chunk == TRUE || is_valid_for_limit_inf || is_valid_for_limit_sup ) + { + if ((long)(aux=d.hash_lookup(wrd)) == HASH_FAIL) + { + aux= new dataDict; + aux->wrd = wrd; + aux->numMaybe = 1; + aux->numWrd = 1; + data = new infoDict; + data->pos = pos; + data->num=1; + aux->maybe.add(data); + d.hash_insert(aux->wrd,aux); + cont++; + } + else dictIncInfo(aux,pos); + contWordsAdded++; + + if (wrd =="@CARD" || wrd == "@CARDPUNCT" + || wrd == "@CARDSEPS" || wrd == "@CARDSUFFIX") + { + if ((long)(aux2=d.hash_lookup(real)) == HASH_FAIL) + { + aux2 = new dataDict; + aux2->wrd = real; + aux2->numMaybe = 1; + aux2->numWrd = 1; + data = new infoDict; + data->pos = pos; + data->num = 1; + aux2->maybe.add(data); + d.hash_insert(aux2->wrd,aux2); + cont++; + 
} + else dictIncInfo(aux2,pos); + } + } + } + contWords++; + } + + if ( verbose == TRUE ) fprintf(stderr,"[ %d words ]",cont); +} + +/**************************************************/ + +void dictionary::dictRepairFromFile(const std::string& fileName) +{ + if ( verbose == TRUE ) fprintf(stderr,"\nReparing Dictionary with file < %s >",fileName.c_str()); + FILE *f = openFile(fileName.c_str(),"r"); + + char wrd[250],pos[10]; + int numWrd,numMaybe,numWrdxPOS; + dataDict *aux; + + + // Bucle para leer lista de palabras + while (!feof(f)) + { + fscanf(f,"%s %d %d",wrd,&numWrd,&numMaybe); + dataDict* w = d.hash_lookup(wrd); + if ((long)w!=HASH_FAIL) + { + aux = new dataDict; + aux->wrd = wrd; + aux->numWrd = getElementNumWord(w); + aux->numMaybe = 0; + + simpleList *l = &getElementMaybe(w); + + for (int i=0;igetIndex(); + if (pos == ptr->pos) + { + //Copiamos elemento a añadir + infoDict *tmpInfoDict = new infoDict; + tmpInfoDict->pos = ptr->pos; + tmpInfoDict->num = ptr->num; + + aux->maybe.add(tmpInfoDict); + aux->numMaybe++; + ret = false; + delete ptr; //Borrar substituido + } + else ret = l->next(); + } + l->setFirst(); + } + + delete d.hash_delete(wrd); + d.hash_insert(aux->wrd,aux); + } + } + fclose(f); +} + +/**************************************************/ + +void dictionary::dictRepairHeuristic(float dratio) +{ + for (hash_t::iterator it = d.begin(); it != d.end(); it++) + { + dataDict *dd = (dataDict *)((*it).second); + + bool ret=true; + + simpleList *l = &dd->maybe; + l->setFirst(); + int iNumWrdsAfterDelete = dd->numWrd; + while (ret) + { + infoDict *ptr = *l->getIndex(); + + float fRange = (float) ptr->num / (float) dd->numWrd ; + + if (fRange < dratio) + { + dd->numMaybe--; + iNumWrdsAfterDelete = iNumWrdsAfterDelete - ptr->num; + l->delIndex(); //Eliminar pos + } + ret = l->next(); + } + dd->numWrd = iNumWrdsAfterDelete; + l->setFirst(); + } +} + +/**************************************************/ + +int dictionary::readInt(FILE *in) +{ int i=0; + 
char value[10]; + char c=' '; + + strcpy(value,""); + + while ((c==' ') && (!feof(in))) c=fgetc(in); + while ((i<10) && (c!=' ') && (c!='\n') && (!feof(in))) + { + sprintf(value,"%s%c",value,c); + c=fgetc(in); i++; + } + return atoi(value); +} + +/**************************************************/ + +infoDict *dictionary::readData(FILE *in) +{ + infoDict *data = new infoDict; + char c=fgetc(in); + int i = 0; + + data->pos = ""; + + while ( (ipos += c; + c=fgetc(in); + i++; + } + data->num = readInt(in); + return data; +} + +/**************************************************/ + +void dictionary::dictAddBackup(const std::string& name) +{ + FILE *f = openFile(name,"r"); + +// char wrd[250],pos[10]; + int i; + dataDict *aux; + infoDict *data; + + // Loop to read list of words + while (!feof(f)) + { + data = readData(f); + i = readInt(f); + long w = (long)d.hash_lookup(data->pos); + if (w==HASH_FAIL) + { + aux = new dataDict; + aux->wrd = data->pos; + aux->numWrd = 0; + aux->numMaybe = 0; + } + else aux = (dataDict *) w; + + aux->numWrd += data->num; + delete data; + while (i>0) + { + data = readData(f); + bool ret=true; + //If not found add it to the list + for (int j=aux->numMaybe;ret>=0 && j>0; j--) + { + infoDict *element = *aux->maybe.getIndex(); + if (data->pos == element->pos) + { + ret = false; + element->num += data->num; + } + else ret = aux->maybe.next(); + } + //If not found add it to the list + if (ret) + { + aux->maybe.add(data); + aux->numMaybe++; + } + else delete data; + i--; + } + if (w==HASH_FAIL) d.hash_insert(aux->wrd,aux); + + } //End while not eof + fclose(f); +} + +void dictionary::addBackupEntry(const std::string& token, const std::set& tags) +{ + dataDict* aux = d.hash_lookup(token); + if ((long)aux==HASH_FAIL) + { + aux = new dataDict; + aux->wrd = token; + aux->numWrd = 1; + aux->numMaybe = 0; + d.hash_insert(aux->wrd,aux); + } + + for (std::set::const_iterator it = tags.begin(); it != tags.end(); it++) + { + infoDict* data = new 
infoDict(); + data->pos = *it; + data->num = 0; + bool ret=true; + aux->maybe.setFirst(); + //If not found add it to the list + for (int j=aux->numMaybe;ret && j>0; j--) + { + infoDict *element = *aux->maybe.getIndex(); + if (data->pos == element->pos) + { + ret = false; + element->num += data->num; + } + else ret = aux->maybe.next(); + } + aux->maybe.setFirst(); + + //If not found add it to the list + if (ret) + { +// std::cerr << "add possible tag '"<pos<<"' for token '"<maybe.add(data); + aux->numMaybe++; + } + else delete data; + } +} + +/**************************************************/ + +void dictionary::dictLoad(FILE *in) +{ + while (!feof(in)) + { + infoDict *data = readData(in); + int i = readInt(in); + dataDict *aux = new dataDict; + + aux->wrd = data->pos; + aux->numWrd = data->num; + aux->numMaybe = i; + delete data; + + // If we read a line with numMaybe = 0, ie without any label, do not load + if ( aux->numMaybe > 0) + { + while (i>0) + { + data = readData(in); + aux->maybe.add(data); + + i--; + } + d.hash_insert(aux->wrd,aux); + } + else delete aux; + } +} + +/**************************************************/ + +dataDict* dictionary::getElement(const std::string& key) +{ + return d.hash_lookup(key); +} + +/**************************************************/ + +std::string& dictionary::getElementWord(dataDict* ptr) +{ + return ptr->wrd; +} + +/**************************************************/ + +int dictionary::getElementNumWord(dataDict* ptr) +{ + return ptr->numWrd; +} + +/**************************************************/ + +int dictionary::getElementNumMaybe(dataDict* ptr) +{ + return ptr->numMaybe; +} + +/**************************************************/ + +simpleList& dictionary::getElementMaybe(dataDict* ptr) +{ + return ptr->maybe; +} + +/**************************************************/ + +dictionary::dictionary(const std::string& name,const std::string& backup) +{ + FILE *in = openFile(name.c_str(),"r"); + d.hash_init(1000); + 
dictLoad(in); + fclose(in); + dictAddBackup(backup); +} + +/**************************************************/ + +dictionary::dictionary(const std::string& name) +{ + FILE *in = openFile(name.c_str(),"r"); + d.hash_init(1000); + dictLoad(in); + fclose(in); +} + +/**************************************************/ + +dictionary::dictionary(const std::string& name,int limInf, int limSup) +{ + FILE *in = openFile(name.c_str(),"r"); + //char str[200]; + d.hash_init(1000); + dictCreate(in,limInf,limSup); + fclose(in); +} + +/**************************************************/ + +void dictionary::dictCleanListInfoDict(simpleList* l, int num) +{ + infoDict *data = NULL; + + l->setFirst(); + if ( num >= 1 ) //Si tiene mas de un maybe es ambigua + { + bool ret = true; + while (ret) + { + data = *l->getIndex(); + delete data; + ret=l->next(); + } + l->setFirst(); + } /* if */ +} + +/**************************************************/ + +dictionary::~dictionary() +{ + //Recorrer cada entrada del diccionario eliminando el contenido de las listas + for (hash_t::iterator it = d.begin(); it != d.end(); it++) + { + dataDict *aux = (*it).second; + (*it).second = 0; + dictCleanListInfoDict(&aux->maybe,aux->numMaybe); + delete aux; + } /* for */ + + //Destruir hashing + d.clear(); +} + +/**************************************************/ + +hash_t *dictionary::dictFindAmbP(int *numPOS) +{ + infoDict *data; +// dataDict *aux; + + hash_t *ambp = new hash_t(); + ambp->hash_init(30); + + + for (hash_t::iterator it = d.begin(); it != d.end(); it++) + { + dataDict *aux = (*it).second; + +// hash_node_t *old_hash, *tmp; +// int old_size, h, i; + + *numPOS = 0; + + aux->maybe.setFirst(); + if (aux->numMaybe>1) //Si tiene mas de un maybe es ambigua + { + bool ret = true; + while (ret) + { + data = *aux->maybe.getIndex(); + infoDict * tmp = new infoDict; + tmp->pos = data->pos; + tmp->num = data->num; + ambp->hash_insert(tmp->pos,tmp); + + *numPOS++; + ret=aux->maybe.next(); + } + 
aux->maybe.setFirst(); + } /* if */ + } /* for */ + return ambp; +} + +/**************************************************/ + +hash_t *dictionary::dictFindUnkP(int *numPOS) +{ + infoDict *data; +// dataDict *aux; + + hash_t *unkp = new hash_t(); + unkp->hash_init(30); + + + for (hash_t::iterator it = d.begin(); it != d.end(); it++) + { + dataDict *aux = (*it).second; + + *numPOS = 0; + + aux->maybe.setFirst(); + if (aux->numWrd==1) //Si solo aparece una vez desconocida + { + bool ret = true; + while (ret) + { + data = *aux->maybe.getIndex(); + infoDict * tmp = new infoDict; + tmp->pos = data->pos; + tmp->num = data->num; + unkp->hash_insert(tmp->pos,tmp); + *numPOS++; + ret=aux->maybe.next(); + } + aux->maybe.setFirst(); + } /* if */ + } /* for */ + return unkp; +} + diff --git a/src/er.cc b/src/er.cc old mode 100644 new mode 100755 index e01901e..e7f4500 --- a/src/er.cc +++ b/src/er.cc @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU @@ -20,30 +20,30 @@ #include "er.h" /***************************************************************** - * Regular expressions - *****************************************************************/ +* Regular expressions +*****************************************************************/ regex_t erCard,erCardPunct,erCardSeps,erCardSuffix; regex_t erMultiWord,erContainNum,erStartCap,erStartLower,erStartNumber, -erAllUp,erAllLow,erContainCap,erContainCaps,erContainPeriod,erContainComma; + erAllUp,erAllLow,erContainCap,erContainCaps,erContainPeriod,erContainComma; /**************************************************/ -#define ER_STARTCAP "^[A-Z\307\321\301\311\315\323\332\300\310\314\322\331\304\313\317\326\334].*$" -#define ER_STARTLOWER "^[a-z\347\361\341\351\355\363\372\340\350\354\362\371\344\353\357\366\374].*$" -#define ER_STARTNUMBER "^[0-9].*$" -#define ER_ALLUP "^[A-Z\307\321\301\311\315\323\332\300\310\314\322\331\304\313\317\326\334]+$" -#define ER_ALLLOW "^[a-z\347\361\341\351\355\363\372\340\350\354\362\371\344\353\357\366\374]+$" -#define ER_CONTAINCAP "^.+[A-Z\307\321\301\311\315\323\332\300\310\314\322\331\304\313\317\326\334].*$" -#define ER_CONTAINCAPS "^.*[A-Z\307\321\301\311\315\323\332\300\310\314\322\331\304\313\317\326\334].*[A-Z\307\321\301\311\315\323\332\300\310\314\322\331\304\313\317\326\334].*$" +#define ER_STARTCAP "^[A-ZÇÑÁÉÍÓÚÀÈÌÒÙÄËÏÖÜ].*$" +#define ER_STARTLOWER "^[a-zçñáéíóúàèìòùäëïöü].*$" +#define ER_STARTNUMBER "^[0-9].*$" +#define ER_ALLUP "^[A-ZÇÑÁÉÍÓÚÀÈÌÒÙÄËÏÖÜ]+$" +#define ER_ALLLOW "^[a-zçñáéíóúàèìòùäëïöü]+$" +#define ER_CONTAINCAP "^.+[A-ZÇÑÁÉÍÓÚÀÈÌÒÙÄËÏÖÜ].*$" +#define ER_CONTAINCAPS "^.*[A-ZÇÑÁÉÍÓÚÀÈÌÒÙÄËÏÖÜ].*[A-ZÇÑÁÉÍÓÚÀÈÌÒÙÄËÏÖÜ].*$" #define ER_CONTAINPERIOD "^.*[.].*$" #define ER_CONTAINCOMMA "^.*[,].*$" -#define ER_CONTAINNUM "^.*[0-9].*$" -#define ER_MULTIWORD "^.*[-].*$" -#define ER_CARD "^[0-9]+$" -#define ER_CARDPUNCT "^[0-9]+[,!?:.]+$" -#define ER_CARDSEPS "^[0-9]+[-,:\\/.][0-9,:\\/.-]+$" -#define 
ER_CARDSUFFIX "^[0-9]+[^0-9]+.*$" +#define ER_CONTAINNUM "^.*[0-9].*$" +#define ER_MULTIWORD "^.*[-].*$" +#define ER_CARD "^[0-9]+$" +#define ER_CARDPUNCT "^[0-9]+[,!?:.]+$" +#define ER_CARDSEPS "^[0-9]+[-,:\\/.][0-9,:\\/.-]+$" +#define ER_CARDSUFFIX "^[0-9]+[^0-9]+.*$" /**************************************************/ @@ -67,7 +67,6 @@ void erCompRegExp() regcomp (&erMultiWord,ER_MULTIWORD,REG_EXTENDED); } - /**************************************************/ void erFreeRegExp() @@ -90,33 +89,29 @@ void erFreeRegExp() regfree(&erMultiWord); } - /**************************************************/ /* * return 1 if str is like the regular expression * in other case return 0 */ -int erLookRegExp2(void *er,char * str) +int erLookRegExp2(void *er,const std::string& str) { - int ret=0; - - if (!regexec ((regex_t *)er,str,0,NULL,0)) return 1; + if (!regexec ((regex_t *)er,str.c_str(),0,NULL,0)) return 1; return 0; } - /**************************************************/ -int erLookRegExp(char *m) +int erLookRegExp(const std::string& m) { int ret=-1; - if (!regexec (&erCardPunct,m,0,NULL,0)) ret=CARDPUNCT; - else if (!regexec (&erCardSeps,m,0,NULL,0)) ret=CARDSEPS; - else if (!regexec (&erCardSuffix,m,0,NULL,0)) ret=CARDSUFFIX; - else if (!regexec (&erCard,m,0,NULL,0)) ret=CARD; + if (!regexec (&erCardPunct,m.c_str(),0,NULL,0)) ret=CARDPUNCT; + else if (!regexec (&erCardSeps,m.c_str(),0,NULL,0)) ret=CARDSEPS; + else if (!regexec (&erCardSuffix,m.c_str(),0,NULL,0)) ret=CARDSUFFIX; + else if (!regexec (&erCard,m.c_str(),0,NULL,0)) ret=CARD; return ret; } diff --git a/src/hash.cc b/src/hash.cc deleted file mode 100644 index eac6e5b..0000000 --- a/src/hash.cc +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 
of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include "hash.h" - -/**************************************************/ - -#define HASH_LIMIT 0.5 - -/**************************************************/ - -/* - * hash() - Hash function returns a hash number for a given key. - * - * tptr: Pointer to a hash table - * key: The key to create a hash number for - */ -static int hash(const hash_t *tptr, const char *key) -{ - int i=0; - int hashvalue; - - if (key) while (*key != '\0') i=(i<<3)+(*key++ - '0'); - - hashvalue = (((i*1103515249)>>tptr->downshift) & tptr->mask); - if (hashvalue < 0) hashvalue = 0; - - return hashvalue; -} - - -/**************************************************/ - -/* - * rebuild_table() - Create new hash table when old one fills up. - * - * tptr: Pointer to a hash table - */ -void rebuild_table(hash_t *tptr) -{ - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - /* create a new table and rehash old buckets */ - hash_init(tptr, old_size<<1); - for (i=0; inext; - h=hash(tptr, tmp->key); - tmp->next=tptr->bucket[h]; - tptr->bucket[h]=tmp; - tptr->entries++; - } /* while */ - } /* for */ - - /* free memory used by old table */ - free(old_bucket); - - return; -} - - -/**************************************************/ - -/* - * hash_init() - Initialize a new hash table. 
- * - * tptr: Pointer to the hash table to initialize - * buckets: The number of initial buckets to create - */ -void hash_init(hash_t *tptr, int buckets) -{ - /* make sure we allocate something */ - if (buckets==0) buckets=16; - - /* initialize the table */ - tptr->entries=0; - tptr->size=2; - tptr->mask=1; - tptr->downshift=29; - - /* ensure buckets is a power of 2 */ - while (tptr->sizesize<<=1; - tptr->mask=(tptr->mask<<1)+1; - tptr->downshift--; - } /* while */ - - /* allocate memory for table */ - tptr->bucket=(hash_node_t **) calloc(tptr->size, sizeof(hash_node_t *)); - - return; -} - - -/**************************************************/ - -/* - * hash_lookup() - Lookup an entry in the hash table and return a pointer to - * it or HASH_FAIL if it wasn't found. - * - * tptr: Pointer to the hash table - * key: The key to lookup - */ -uintptr_t hash_lookup(const hash_t *tptr, const char *key) -{ - int h; - hash_node_t *node; - - /* find the entry in the hash table */ - h=hash(tptr, key); - for (node=tptr->bucket[h]; node!=NULL; node=node->next) - { - if (!strcmp(node->key, key)) break; - } - - /* return the entry if it exists, or HASH_FAIL */ - return(node ? node->data : HASH_FAIL); -} - - -/**************************************************/ - -/* - * hash_insert() - Insert an entry into the hash table. If the entry already - * exists return a pointer to it, otherwise return HASH_FAIL. 
- * - * tptr: A pointer to the hash table - * key: The key to insert into the hash table - * data: A pointer to the data to insert into the hash table - */ -uintptr_t hash_insert(hash_t *tptr, const char *key, uintptr_t data) -{ - uintptr_t tmp; - hash_node_t *node; - int h; - - /* check to see if the entry exists */ - if ((tmp=hash_lookup(tptr, key)) != HASH_FAIL) return(tmp); - - /* expand the table if needed */ - while (tptr->entries>=HASH_LIMIT*tptr->size) - rebuild_table(tptr); - - /* insert the new entry */ - h=hash(tptr, key); - node=(struct hash_node_t *) malloc(sizeof(hash_node_t)); - node->data=data; - node->key=key; - node->next=tptr->bucket[h]; - tptr->bucket[h]=node; - tptr->entries++; - - return HASH_FAIL; -} - - -/**************************************************/ - -/* - * hash_delete() - Remove an entry from a hash table and return a pointer - * to its data or HASH_FAIL if it wasn't found. - * - * tptr: A pointer to the hash table - * key: The key to remove from the hash table - */ -uintptr_t hash_delete(hash_t *tptr, const char *key) -{ - hash_node_t *node, *last; - uintptr_t data; - int h; - - /* find the node to remove */ - h=hash(tptr, key); - for (node=tptr->bucket[h]; node; node=node->next) - { - if (!strcmp(node->key, key)) break; - } - - /* Didn't find anything, return HASH_FAIL */ - if (node==NULL) return HASH_FAIL; - - /* if node is at head of bucket, we have it easy */ - if (node==tptr->bucket[h]) tptr->bucket[h]=node->next; - else - { - /* find the node before the node we want to remove */ - for (last=tptr->bucket[h]; last && last->next; last=last->next) - { - if (last->next==node) - break; - } - last->next=node->next; - } - - /* free memory and return the data */ - data=node->data; - free(node); - - return(data); -} - - -/**************************************************/ - -/* - * hash_destroy() - Delete the entire table, and all remaining entries. 
- */ -void hash_destroy(hash_t *tptr) -{ - hash_node_t *node, *last; - int i; - - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - free(last); - } - } - - /* free the entire array of buckets */ - if (tptr->bucket != NULL) - { - free(tptr->bucket); - memset(tptr, 0, sizeof(hash_t)); - } -} - - -/**************************************************/ - -/* - * alos() - Find the average length of search. - * - * tptr: Pointer to a hash table - */ -static float alos(hash_t *tptr) -{ - int i,j; - float alos=0; - hash_node_t *node; - - for (i=0; isize; i++) - { - for (node=tptr->bucket[i], j=0; node!=NULL; node=node->next, j++); - if (j) alos+=((j*(j+1))>>1); - } /* for */ - - return(tptr->entries ? alos/tptr->entries : 0); -} - - -/**************************************************/ - -/* - * hash_stats() - Return a string with stats about a hash table. - * - * tptr: A pointer to the hash table - */ -char * hash_stats(hash_t *tptr) -{ - static char buf[1024]; - - sprintf(buf, "%u slots, %u entries, and %1.2f ALOS",(int)tptr->size, (int)tptr->entries, alos(tptr)); - - return(buf); -} - - -/**************************************************/ - -/* - * hash_print() - Print Keys in FILE *f - * - */ -void hash_print(hash_t *tptr,FILE *f) -{ - hash_node_t *node, *last; - int i; - - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - fprintf(f,"%s\n",last->key); - } - } -} diff --git a/src/learner.cc b/src/learner.cc old mode 100644 new mode 100755 dissimilarity index 84% index a3d7f05..559c1a2 --- a/src/learner.cc +++ b/src/learner.cc @@ -1,1792 +1,1740 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the 
License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include "marks.h" -#include "hash.h" -#include "list.h" -#include "dict.h" -#include "stack.h" -#include "swindow.h" -#include "mapping.h" -#include "weight.h" -#include "learner.h" -#include "common.h" - -/**************************************************************/ - -extern int verbose_svmtool; -double time_svmlight = 0; - -/**************************************************************/ - -stack_t DO; -int KERNEL=0; -int DEGREE=0; -float CK = 0; -float CU = 0; -float X = 3; -int MAX_MAPPING_SIZE = 100000; -int COUNT_CUT_OFF = 2; -int WINDOW_SIZE = 5; -int CORE_POSITION = 2; -char *TRAINSET = NULL; -char *SVMDIR = NULL; -char *NAME = NULL; -char *BLEX = NULL; -char *R = NULL; -float DRATIO = 0.001; -float ERATIO=0; -float KFILTER = 0; -float UFILTER = 0; -int REMOVE_FILES = TRUE; - -/**************************************************************/ - -char *UP = NULL; -char *AP = NULL; - -/**************************************************************/ - -//ambiguous-right [default] -const char *A0 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"; -const char *A0UNK = 
"w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(2)\na(3)\na(4)\nz(2)\nz(3)\nz(4)\nca(1)\ncz(1)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"; - -/**************************************************************/ - -//unambiguous-right -const char *A1 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"; -const char *A1UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"; - -/**************************************************************/ - -//no-right -const char *A2 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\n"; -const char *A2UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"; - -/**************************************************************/ - -//unsupervised-learning -const char *A3 = 
"w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\n"; -const char *A3UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"; - -/**************************************************************/ - -//ambiguous-right ++ unknown words on training -const char *A4 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"; -const char *A4UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"; - -/**************************************************************/ - -char *learner::read_feature_list_from_config_file(FILE *f, char *first_feature) -{ - char tmp[1000],str[500],*out; - strcpy(tmp,""); - sprintf(tmp,"%s\n",first_feature); - - int cont=1; - int ret=1; - - while (ret>0) - { - strcpy(str,""); - ret = readTo(f,' ','\n',str); - if (ret>=0) - { - sprintf(tmp,"%s%s\n",tmp,str); - cont++; - } - } - out = new char[strlen(tmp)+1]; - strcpy(out,tmp); - return out; -} - - -/**************************************************************/ - -void learner::read_config_file(const char *config_file) -{ - int 
ret1=1,ret2=1,ret3=1; - char str1[500],str2[500],str3[500]; - - FILE *f = openFile(config_file,"r"); - FILE *tmp = openFile("tmp_config.svmt","w+"); - char c,ant; - ant = '~'; - while (!feof(f)) - { - c = fgetc(f); - if (c=='#') readTo(f,'\n','\n',str1); - else if (!(c == ' ' && ant == ' ')) fprintf(tmp,"%c",c); - ant = c; - } - fclose(f); - - fseek(tmp,0,SEEK_SET); - while (!feof(tmp)) - { - strcpy(str1,""); strcpy(str2,""); strcpy(str3,""); - ret1 = readTo(tmp,' ','=',str1); - if (ret1>=0 && strcmp(str1,"do")==0) - { - ret2 = readTo(tmp,' ',' ',str2); - ret3 = readTo(tmp,' ','\n',str3); - char *p = strstr(str2,"M"); - int *modelo = new int; - int *direction = new int; - *modelo = atoi(&p[1]); - if (strcmp(str3,"LR")==0) *direction = LEFT_TO_RIGHT; - else if (strcmp(str3,"RL")==0) *direction = RIGHT_TO_LEFT; - else if (strcmp(str3,"LRL")==0) *direction = LR_AND_RL; - push(&DO,direction); - push(&DO,modelo); - } - else if (ret1>=0 && strcmp(str1,"")!=0) - { - ret2 = readTo(tmp,' ',' ',str2); - ret2 = readTo(tmp,' ','\n',str2); - if (ret1>=0 && ret2>=0) - { - if (strcmp(str1,"A0k")==0 || strcmp(str1,"A0")==0) A0 = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A1k")==0 || strcmp(str1,"A1")==0) A1 = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A2k")==0 || strcmp(str1,"A2")==0) A2 = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A3k")==0 || strcmp(str1,"A3")==0) A3 = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A4k")==0 || strcmp(str1,"A4")==0) A4 = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A0u")==0 || strcmp(str1,"A0unk")==0) A0UNK = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A1u")==0 || strcmp(str1,"A1unk")==0) A1UNK = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A2u")==0 || strcmp(str1,"A2unk")==0) A2UNK = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A3u")==0 || 
strcmp(str1,"A3unk")==0) A3UNK = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"A4u")==0 || strcmp(str1,"A4unk")==0) A4UNK = read_feature_list_from_config_file(tmp,str2); - else if (strcmp(str1,"F")==0 && ret3>=0) - { - ret3 = readTo(tmp,'\n','\n',str3); - MAX_MAPPING_SIZE = atoi(str3); - COUNT_CUT_OFF = atoi (str2); - } - else if (strcmp(str1,"W")==0) - { - ret3 = readTo(tmp,'\n','\n',str3); - WINDOW_SIZE = atoi(str2); - CORE_POSITION = atoi (str3); - } - else if (strcmp(str1,"TRAINSET")==0) - { - TRAINSET = new char[strlen(str2)+1]; - strcpy(TRAINSET,str2); - } - else if (strcmp(str1,"BLEX")==0) - { - BLEX = new char[strlen(str2)+1]; - strcpy(BLEX,str2); - } - else if (strcmp(str1,"R")==0) - { - R = new char[strlen(str2)+1]; - strcpy(R,str2); - } - else if (strcmp(str1,"SVMDIR")==0) - { - SVMDIR = new char[strlen(str2)+1]; - strcpy(SVMDIR,str2); - } - else if (strcmp(str1,"NAME")==0) - { - NAME = new char[strlen(str2)+1]; - strcpy(NAME,str2); - } - else if (strcmp(str1,"REMOVE_FILES")==0) - { - REMOVE_FILES = atoi (str2); - } - else if (strcmp(str1,"CK")==0) - { - CK = atof (str2); - } - else if (strcmp(str1,"CU")==0) - { - CU = atof (str2); - } - else if (strcmp(str1,"Dratio")==0) - { - DRATIO = atof (str2); - } - else if (strcmp(str1,"Eratio")==0) - { - ERATIO = atof (str2); - } - else if (strcmp(str1,"Kfilter")==0) - { - KFILTER = atof (str2); - } - else if (strcmp(str1,"Ufilter")==0) - { - UFILTER = atof (str2); - } - else if (strcmp(str1,"X")==0) - { - X = atof (str2); - } - else if (strcmp(str1,"AP")==0) - { - int ret = 1; - learnerAMBP_H = new hash_t; - hash_init(learnerAMBP_H,30); - - infoDict * etiq = new infoDict; - strcpy(etiq->txt,str2); - hash_insert(learnerAMBP_H,etiq->txt,(uintptr_t) etiq); - - if (ret2>0) - { - memset(str2,0,strlen(str2)); - fgets(str2,500,tmp); - - for (int i=0; itxt,str3); - hash_insert(learnerAMBP_H,etiq->txt,(uintptr_t) etiq); - } //EndFor - } //End if(ret2>0) - } - else if (strcmp(str1,"UP")==0) - { - 
int ret = 1; - learnerUNKP_H = new hash_t; - hash_init(learnerUNKP_H,30); - - infoDict * etiq = new infoDict; - strcpy(etiq->txt,str2); - hash_insert(learnerUNKP_H,etiq->txt,(uintptr_t) etiq); - if (ret2>0) - { - memset(str2,0,strlen(str2)); - fgets(str2,500,tmp); - - for (int i=0; ret2>0 && itxt,str3); - hash_insert(learnerUNKP_H,etiq->txt,(uintptr_t) etiq); - } //Endfor - } //End if(ret2>0) - } - } - } - } - fclose (tmp); - - if (verbose_svmtool==TRUE) - { - fprintf(stderr,"\n* ===================== SVMTlearn configuration =========================="); - fprintf(stderr,"\n* config file = [ %s ]\n* trainset = [ %s ]\n* model name = [ %s ]",config_file,TRAINSET,NAME); - fprintf(stderr,"\n* SVM-light dir = [ %s ]",SVMDIR); - fprintf(stderr,"\n* ========================================================================"); - fprintf(stderr,"\n* unknown words expected = [ X = %f % ]",X); - fprintf(stderr,"\n* C parameter for known = [ CK = %f ]",CK); - fprintf(stderr,"\n* C parameter for unknown = [ CU = %f ]",CU); - fprintf(stderr,"\n* D ratio = [ Dratio = %f ]",DRATIO); - fprintf(stderr,"\n* E ratio = [ Eratio = %f ]",ERATIO); - fprintf(stderr,"\n* Known weights filter = [ Kfilter = %f ]",KFILTER); - fprintf(stderr,"\n* Unknown weights filter = [ Ufilter = %f ]",UFILTER); - fprintf(stderr,"\n* sliding window settings = [ WINDOW SIZE = %d , CORE POSITION = %d ]",WINDOW_SIZE,CORE_POSITION); - fprintf(stderr,"\n* mapping settings = [ COUNT CUT OFF = %d , MAX MAPPING SIZE = %d ]",COUNT_CUT_OFF,MAX_MAPPING_SIZE); - fprintf(stderr,"\n* remove temporal files = [ %d ] (1) TRUE, (0) FALSE",REMOVE_FILES); - fprintf(stderr,"\n* ========================================================================"); - } - - system("rm -f tmp_config.svmt"); - - if (TRAINSET == NULL) { fprintf (stderr,"\nError: TRAINSET parameter not found in %s.\n",config_file); exit(-1); } - if (NAME == NULL) { fprintf (stderr,"\nError: MODEL NAME parameter not found in %s.\n",config_file); exit(-1); } - if 
(SVMDIR == NULL) { fprintf (stderr,"\nError: SVM DIRECTORY parameter not found in %s.\n",config_file); exit(-1); } -} - - -/**************************************************************/ - -learner::learner() -{ - learnerAMBP_H = NULL ; - learnerUNKP_H = NULL ; -} - - -learner::~learner() -{ - char str[1000]; - strcpy(str,""); - if (REMOVE_FILES==TRUE) - { - removeFiles(NAME,RM_TEMP_FILES,0,0,verbose_svmtool); - } - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\n\nTERMINATION ... "); - - if (TRAINSET!=NULL) delete TRAINSET; - if (SVMDIR!=NULL) delete SVMDIR; - if (NAME!=NULL) delete NAME; - if (BLEX!=NULL) delete BLEX; - if (R!=NULL) delete R; - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"[DONE]\n\n"); -} - - -/***************************************************************/ - -void learner::learnerCreatePOSFile(char *modelName, int is_ambp, hash_t *h) -{ - char name[300]; - if (is_ambp==TRUE) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.AMBP",modelName); - sprintf(name,"%s.AMBP",modelName); - FILE *f = openFile (name,"w"); - hash_print(h,f); - fclose(f); - } - else - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.UNKP",modelName); - sprintf(name,"%s.UNKP",modelName); - FILE *f = openFile (name,"w"); - hash_print(h,f); - fclose(f); - } -} - - -/***************************************************************/ - -void learner::learnerCreateDefaultFile(const char *modelName, const char *str) -{ - char name[500]; - sprintf (name,"%s.%s",modelName,str); - FILE *f = openFile(name, "w"); - - if (strcmp(str,"A0")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A0",modelName); fprintf(f,"%s",A0); - sprintf (name,"%s.%s.UNK",modelName,str); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A0.UNK",modelName); - FILE *funk = openFile(name, "w"); - fprintf(funk,"%s",A0UNK); - fclose(funk); - } - else if (strcmp(str,"A1")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring 
%s.A1",modelName); fprintf(f,"%s",A1); - sprintf (name,"%s.%s.UNK",modelName,str); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A1.UNK",modelName); - FILE *funk = openFile(name, "w"); - fprintf(funk,"%s",A1UNK); - fclose(funk); - } - else if (strcmp(str,"A2")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A2",modelName); fprintf(f,"%s",A2); - sprintf (name,"%s.%s.UNK",modelName,str); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A2.UNK",modelName); - FILE *funk = openFile(name, "w"); - fprintf(funk,"%s",A2UNK); - fclose(funk); - } - else if (strcmp(str,"A3")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A3",modelName); fprintf(f,"%s",A3); - sprintf (name,"%s.%s.UNK",modelName,str); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A3.UNK",modelName); - FILE *funk = openFile(name, "w"); - fprintf(funk,"%s",A3UNK); - fclose(funk); - } - else if (strcmp(str,"A4")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A4",modelName); fprintf(f,"%s",A4); - sprintf (name,"%s.%s.UNK",modelName,str); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.A4.UNK",modelName); - FILE *funk = openFile(name, "w"); - fprintf(funk,"%s",A4UNK); - fclose(funk); - } - else if (strcmp(str,"WIN")==0) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring %s.WIN",modelName); - fprintf(f,"%d\n%d\n",WINDOW_SIZE,CORE_POSITION); - } - fclose(f); -} - - -/***************************************************************/ - -/* - * Return CHAR_NULL if end of file - * Return w,p,k (if we readed a) or s (if we readed m) if it's found - */ -char learner::obtainAtrChar(FILE *channel) -{ - char c; - while (!feof(channel)) - { - c=fgetc(channel); - if (c=='w' || c=='p' || c=='k' || c=='m') - { - fgetc(channel); - switch (c) - { - case 'k': return 'k'; - case 'm': return 's'; - default: return c; - } - } - } - return CHAR_NULL; -} - - 
-/**************************************************************/ - -/* - * Cada atributo de una lista de atributos tiene la forma - * () - * Usaremos esta funcin para leer cada uno de los enteros de - * la cadena que indica las posiciones de la ventana a considerar. - * Lee del fichero (channel) un entero entre '(' y coma, comas, - * coma y ') o parntesis. Devuelve el nmero ledo como entero. - */ -int learner::obtainAtrInt(FILE *channel,int *endAtr) -{ - int i=0; - char c=' ',num[5]=""; - - while ( (!feof(channel)) && (c!='(') && (c!=',') && (c!=')') ) - { - c=fgetc(channel); - if ((c!='(') && (c!=')')) num[i]=c; - i++; - } - if (c==')') *endAtr=1; - num[i]='\0'; - return atoi(num); -} - - -/***************************************************************/ - -/* int learnerCount(char *name, int *nWords, int *nSentences) - * Cuenta el numero de palabras y frases que tiene el corpus - * contenido en el fichero con nombre . El numero de - * palabras y de frases es devuelto como parametro de salida - * mediante y . - */ -void learner::learnerCount(char* name, int *nWords, int *nSentences) -{ - int ret1=0,ret2=0,cont=0,contWords=0; - char c=' ',str[300]; - - FILE *f = openFile(name, "r"); - while (!feof(f) && ret1>=0 && ret2>=0) - { - ret1 = readTo(f,' ',0,str); - if (strcmp(str,"!")==0 || strcmp(str,"?")==0 || strcmp(str,".")==0) cont++; - ret2 = readTo(f,'\n',0,str); - if ( !feof(f) && ret1 >= 0 ) contWords++; - } - fclose(f); - - if ( contWords <= 1 ) - { - fprintf(stderr,"\n\nInput corpus too short to begin training!! 
Program stopped.\n\n"); - exit(0); - } - - *nWords = contWords; - *nSentences = cont; -} - - -/********************************************************/ - -void learner::learnerPrintMessage(int numModel, int K_or_U, int LR_or_RL, int is_fex) -{ - if (verbose_svmtool==TRUE) - { - fprintf(stderr,"\n\n* ========================================================================"); - if (is_fex==FALSE) fprintf(stderr,"\n* TRAINING MODEL %d ",numModel); - else fprintf(stderr,"\n* FEATURES EXTRACTION FOR MODEL %d ",numModel); - if (is_fex==TRUE) fprintf(stderr,"[ KNOWN AND UNKNOWN WORDS - "); - else if (K_or_U==KNOWN) fprintf(stderr,"[ KNOWN WORDS - "); - else fprintf(stderr,"[ UNKNOWN WORDS - "); - if (LR_or_RL==LEFT_TO_RIGHT) fprintf(stderr,"LEFT TO RIGHT ]"); - else if (LR_or_RL==RIGHT_TO_LEFT) fprintf(stderr,"RIGHT TO LEFT ]"); - else if (LR_or_RL==LR_AND_RL)fprintf(stderr,"LEFT TO RIGHT AND RIGHT TO LEFT ]"); - fprintf(stderr,"\n* ========================================================================"); - } -} - - -/**************************************************************/ - -/* - * Parmetros: - * wrd char * Palabra de ejemplo que se est tratando. - * numModel int Modelo que estamos entrenando. - * direction int Direccin en la cual se realiza el entrenamiento - (Derecha a izquierda o izquierda a derecha). - * Known_or_Unknown int Si se est entrenando para palabras conocidas - o desconocidas. - * pos char * La etiqueta morfosintctica que estamos entrenando. - * samplePos char * Etiqueta morfosintctica del ejemplo que estamos tratando. - * features char * Lista de atributos generados para el ejemplo. - * d dictionary * Diccionario que se est usando para el entrenamiento. - * nNeg int * Apuntador al nmero de palabras seleccionadas como ejemplos negativos. - * nPos int * Apuntador al nmero de palabras seleccionadas como ejemplos positivos. 
- * - * Este mtodo puede usarse para seleccionar ejemplos para palabras conocidas - */ -void learner::learnerPushSample(char *wrd,int numModel,int direction, int Known_or_Unknown,char *pos, char *samplePos, char *features,dictionary *d, int *nNeg, int *nPos) -{ - char fileName[100]; - //Se abre el fichero donde se debe insertar el ejemplo. - generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName); - - FILE *f = openFile(fileName,"a+"); - - //Se obtiene la lista de posibles etiquetas morfosintcticas para - //la palabra wrd. - simpleList *l = learnerGetPotser(wrd,Known_or_Unknown,d); - l->setFirst(); - for (int stop=1; stop>=0 ; stop = l->next()) - { - //Se busca la etiqueta pos en la lista obtenida. - //Si se encuentra y es igual a samplePos se selecciona el ejemplo - //como positivo, si no es igual a samplePos se selecciona como - //negativo. - infoDict *pInfo = (infoDict *) l->getIndex(); - if (pInfo != NULL ) - { - if (strcmp(pInfo->txt,pos)==0) - { - if (strcmp(pInfo->txt,samplePos)==0) - { - //Positive Sample - *nPos=(*nPos)+1; - fprintf (f,"+1 %s\n",features); - } - else - { - //Negative Sample - *nNeg=(*nNeg)+1; - fprintf (f,"-1 %s\n",features); - } - } - } - } - - d->dictCleanListInfoDict(l,l->numElements()); - delete l; - //Se cierra el fichero. - fclose(f); -} - - -/**************************************************************/ -/* - * Parmetros: - * wrd Char * Palabra del ejemplo que se est tratando. - * numModel Int Modelo que estamos entrenando. - * direction Int Direccin en la cual se realiza el entrenamiento - (Derecha a izquierda o izquierda a derecha). - * Known_or_Unknown Int Si se est entrenando para palabras conocidas - o desconocidas. - * pos Char * La etiqueta morfosintctica que estamos entrenando. - * samplePos Char * Etiqueta morfosintctica del ejemplo que estamos tratando. - * features Char * Lista de atributos generadas para el ejemplo. - * d dictionary * Diccionario que se est usando para el entrenamiento. 
- * nNeg int * Apuntador al nmero de palabras seleccionadas como ejemplos negativos. - * nPos int * Apuntador al nmero de palabras seleccionadas como ejemplos positivos. - * Este mtodo se encarga de seleccionar ejemplos para palabras desconocidas. -*/ -void learner::learnerPushSampleUnk(char *wrd,int numModel,int direction, int Known_or_Unknown,char *pos, char *samplePos, char *features,dictionary *d, int *nNeg, int *nPos) -{ - char fileName[100]; - generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName); - //Abrimos el fichero - FILE *f = openFile(fileName,"a+"); - - //Si pos es igual a samplePos se selecciona el ejemplo como positivo, - //en cualquier otro caso se selecciona como negativo. - if (strcmp(samplePos,pos)==0) - { - //Positive Sample - *nPos=(*nPos)+1; - fprintf (f,"+1 %s\n",features); - } - else - { - //Negative Sample - *nNeg=(*nNeg)+1; - fprintf (f,"-1 %s\n",features); - } - //Cerramos el fichero - fclose(f); -} - - -/**************************************************************/ -/* - * Este mtodo recibe como parmetros un apuntador a un fichero (f) - * y el apuntador a un objeto del tipo mapping. El objetivo de este - * mtodo es leer los atributos generados para un ejemplo del fichero - * f y devolver un string con la lista de atributos en el formato - * esperado por SVM-light. 
- */ -char *learner::learnerCreateFeatureString(FILE *f,mapping *m) -{ - char *features = new char[1000]; - int array[learnerNumFeatures]; - int ret1=1,i = 0; - char str[100]; - - //Construimos un array de enteros con los - //identificadores numricos de cada atributo - while (ret1>0 && !feof(f)) - { - ret1 = readTo(f,' ','\n',str); - int num = m->mappingGetNumberByFeature(str); - if (ret1>=0 && num>-1) - { - array[i]=num; - i++; - } - } - - //qsort --> ordena ascendetemente un array de enteros - qsort(array,0,i-1); - sprintf(features,""); - - for (int j=0;j=0 && ret2>=0) - { - int isPossiblePOS = learnerIsPossiblePOS(wrd,pos,K_or_U); - if (isPossiblePOS==TRUE) - { - //Preparamos la lista de features - features = learnerCreateFeatureString(f,m); - - if (K_or_U==KNOWN) learnerPushSample(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos); - else learnerPushSampleUnk(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos); - - if ( verbose_svmtool == TRUE) showProcessDone(cont , 1000, FALSE,"samples"); - - delete features; - } - else readTo(f,'\n','\n',garbage); - cont++; - } //if - } //While -} - - -/**************************************************************/ - -/* - * Parmetros: - * f FILE* Apuntador al fichero de ejemplos - * numModel Int Modelo que estamos entrenando. - * LR_or_RL Int Direccin en la cual se realiza el entrenamiento - * (Derecha a izquierda o izquierda a derecha). - * K_or_U Int Si se est entrenando para palabras conocidas o desconocidas. - * d dictionary* Diccionario que se est usando para el entrenamiento. - * lPosToTrain simpleList * Lista de etiquetas morfosintcticas a entrenar - * - * Este mtodo es el encargado de realizar el aprendizaje. Prepara las opciones de - * ejecucin para SVM-light. Construye el mapping mediante los datos contenidos - * en el fichero de entrada. Crea un depsito de pesos (weightRepository) y - * un hashing en el que almacenar los sesgos obtenidos para cada etiqueta. 
- * Y prepara los datos para que puedan ser procesados por SVM-light. - * Para cada etiqueta morfosintctica de la lista lPosToTrain, se - * llama al mtodo learnerDressNakedSetTrain para conseguir los ficheros - * de entrada para SVM-light. Una vez hecho esto, se llama a learnerExecSVMlight, - * para ejecutar SVM-light. Con el fichero de salida generado por la herramienta - * de Joachims, se rellenan las estructuras de datos con los pesos - * (learnerBuiltWeightRepository) y los sesgos (learnerBuiltBias) - * Una vez procesadas todas las etiquetas morfosintcticas a entrenar, el depsito - * de pesos y el hashing de sesgos contienen todos los datos del modelo y se escriben - * en disco mediante los mtodos weightRepository.wrWriteHash para los sesgos y - * weightRepository.wrWrite para los pesos. - */ - -void learner::learnerDoLearn(FILE *f,int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList *lPosToTrain) -{ - char posFileName[100],svmFileName[100],mapFileName[100]; - - //Preparamos las opciones con que se ejecutar svm-light - char options[100]=""; - if (CK!=0 && K_or_U==KNOWN) sprintf(options," -c %.6f ",CK); - else if (CU!=0 && K_or_U==UNKNOWN) sprintf(options," -c %.6f ",CU); - if (KERNEL!=0) sprintf(options,"%s -t %d ",options,KERNEL); - if (DEGREE!=0) sprintf(options,"%s -d %d ",options,DEGREE); - - learnerPrintMessage(numModel,K_or_U,LR_or_RL,FALSE); - - if ( verbose_svmtool == TRUE ) - { - fprintf(stderr,"\nBuilding MAPPING for MODEL %d [ ",numModel); - if (K_or_U==KNOWN) fprintf(stderr," KNOWN WORDS - "); - else fprintf(stderr," UNKNOWN WORDS - "); - if (LR_or_RL==LEFT_TO_RIGHT) fprintf(stderr,"LEFT TO RIGHT ]\n"); - else fprintf(stderr,"RIGHT_TO_LEFT ]"); - } - - //Construimos el mapping a partir de los ejemplos seleccionados - mapping *m = new mapping(); - m->mappingBuilt(f,MAX_MAPPING_SIZE,COUNT_CUT_OFF); - generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MAP",mapFileName); - m->mappingWrite(mapFileName,FALSE); - - //Creamos el depsito de pesos y 
el hash de sesgos - hash_t *b = new hash_t; - hash_init(b,30); - weightRepository *wr = new weightRepository; - - infoDict *pInfo; - int nPositive=0,nNegative=0; - - //Nos situamos en el primer elemento de lista - lPosToTrain->setFirst(); - //Para cada elemento de la lista de etiquetas - for (int ret=1; ret>=0; ret=lPosToTrain->next()) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------"); - - //Obtenemos la etiqueta morfosintctica - pInfo = (infoDict *)lPosToTrain->getIndex(); - nPositive=0; - nNegative=0; - - //Preparamos el entrenamiento - generateFileName(NAME,pInfo->txt,numModel,LR_or_RL,K_or_U,"POS",posFileName); - generateFileName(NAME,pInfo->txt,numModel,LR_or_RL,K_or_U,"SVM",svmFileName); - - //Seleccionamos los ejemplos para el entrenamiento para la POS que estamos viendo - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nPreparing training set for [ %s ] ..",pInfo->txt); - learnerDressNakedSetTrain(d,m,f,pInfo->txt,numModel,LR_or_RL, K_or_U,&nPositive,&nNegative); - - //Realizamos el entrenamiento llamando a SVM-light - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nTraining [ %s ] with %d samples: [+] = %d samples ; [-] = %d samples\n",pInfo->txt,nPositive+nNegative,nPositive,nNegative); - learnerExecSVMlight(SVMDIR,options,posFileName,svmFileName); - - //Se insertan los valores obtenidos del entrenamiento en el depsito de pesos - //y el el hashing de sesgos - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nAdding elements to MERGED MODEL from [ %s ]",posFileName); - wr = learnerBuiltWeightRepository(wr,m,pInfo->txt,svmFileName); - if ( verbose_svmtool == TRUE ) fprintf(stderr," [DONE]"); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nAdding biases from [ %s ]",posFileName); - b = learnerBuiltBias(b,pInfo->txt,svmFileName); - if ( verbose_svmtool == TRUE ) fprintf(stderr," [DONE]"); - - if (REMOVE_FILES == TRUE) - { - char cmd[150]; - sprintf(cmd,"rm -f %s",posFileName); - 
system(cmd); - sprintf(cmd,"rm -f %s",svmFileName); - system(cmd); - } - } - lPosToTrain->setFirst(); - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------"); - char fileName[100]; - generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MRG",fileName); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring MERGED MODEL [ %s ]",fileName); - //Modificacin 180705: Filtrado de pesos - //ADD 180705 - if ( K_or_U == KNOWN ) wr->wrWrite(fileName, KFILTER); - //ADD 180705 - else wr->wrWrite(fileName, UFILTER); - //Escribir deposito de pesos en disco - //wr->wrWrite(fileName); //DEL 180705 - if ( verbose_svmtool == TRUE ) fprintf(stderr," [DONE]"); - - FILE *fwr = openFile(fileName,"a+"); - fprintf (fwr,"BIASES "); - wr->wrWriteHash(b,fwr,' '); //Escribir biases en fichero de depsito de pesos - fclose(fwr); - - generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"B",fileName); - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nStoring BIASES [ %s ]",fileName); - FILE *fb =openFile(fileName,"w"); - wr->wrWriteHash(b,fb,'\n'); //Escribir biases en fichero de sesgos - fclose(fb); - - if ( verbose_svmtool == TRUE ) fprintf(stderr," [DONE]"); - - delete m; - delete wr; - learnerDestroyBias(b); -} - - -/*******************************************************/ -/* - * Recibe como parmetro el nombre del fichero de entrenamiento - * (trainingFileName), el diccionario para palabras conocidas (dKnown), - * el nmero del modelo a entrenar (numModel), la direccin en la cual - * se realiza el entrenamiento (direction), el nmero de frases del - * corpus (numSent), el nmero de palabras del corpus (numWords) y - * el nmero de fragmentos en que se ha de dividir el corpus para - * entrenar palabras desconocidas (numChunks). 
- * - * Este mtodo selecciona ejemplo y realiza el entrenamiento del modelo - */ -void learner::learnerTrainModel(char *trainingFileName, dictionary *dKnown,int numModel, int direction, int numSent,int numWords, int numChunks) -{ - FILE *fKnownRL,*fUnknownRL,*fUnknownLR,*fKnownLR; - int contSentences=0,ret = 1; - char name[200]; - - //Carga las listas de atributos - simpleList featureList,featureListUnk; - sprintf(name,"%s.A%d",NAME,numModel); - createFeatureList(name,&featureList); - sprintf(name,"%s.A%d.UNK",NAME,numModel); - createFeatureList(name,&featureListUnk); - - if (direction==LEFT_TO_RIGHT) learnerPrintMessage(numModel,-1,LEFT_TO_RIGHT,TRUE); - else if (direction==RIGHT_TO_LEFT) learnerPrintMessage(numModel,-1,RIGHT_TO_LEFT,TRUE); - else if (direction==LR_AND_RL) learnerPrintMessage(numModel,-1,LR_AND_RL,TRUE); - - //Abrimos los ficheros de ejemplos - if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) - { - sprintf(name,"%s.M%d.RL.SAMPLES",NAME,numModel); - fKnownRL=openFile(name,"w+"); - sprintf(name,"%s.UNK.M%d.RL.SAMPLES",NAME,numModel); - fUnknownRL=openFile(name,"w+"); - } - if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) - { - sprintf(name,"%s.M%d.LR.SAMPLES",NAME,numModel); - fKnownLR=openFile(name,"w+"); - sprintf(name,"%s.UNK.M%d.LR.SAMPLES",NAME,numModel); - fUnknownLR=openFile(name,"w+"); - } - - int chunkSize = (numWords/numChunks) + 1; - - if (verbose_svmtool == TRUE) - { - fprintf(stderr,"\n* X = %f :: CHUNKSIZE = %d :: CHUNKS = %d",X,chunkSize,numChunks); - fprintf(stderr,"\n* ========================================================================"); - } - - int nWordsRL = 0, nWordsLR = 0; - int inicioRL = -1, inicioLR = -1; - - //Para cada chunk - for (int i=0; (ret>=0 && idictRepairFromFile(R); - else dUnknown->dictRepairHeuristic(DRATIO); - char name[200]; - sprintf(name,"%s.DICT.%d",NAME,i); - dUnknown->dictWrite(name); - delete dUnknown; - dUnknown = new dictionary(name); - - //Si el modelo en 4 usaremos el mismo diccionario 
para conocidas y desconocidas - dictionary *d_for_known = NULL; - //if (numModel==4) dKnown = dUnknown; - if ( numModel == 4 ) - { d_for_known = dUnknown; } - else { d_for_known = dKnown; } - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nExtracting features : "); - - nWordsLR = chunkSize; - nWordsRL = chunkSize; - - //Mientras haya palabras por leer y no estemos al final del chunk - while ( ret>=0 && is_end_of_chunk==FALSE ) - { - is_end_of_sentence = FALSE; - //Si es LR o LRL - if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) - { - fKnown = fKnownLR; fUnknown = fUnknownLR; - //Recorremos el texto en sentido LR para seleccionar ejemplos - nWordsLR = learnerLeftToRight(&featureList,&featureListUnk, d_for_known, dUnknown, nWordsLR, inicioLR); - inicioLR = sw->getIndex()->ord; - if (sw->next() != 0) - { - is_end_of_sentence = TRUE; - inicioLR = -1; - } - } - //Si es RL o LRL - if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) - { - fKnown = fKnownRL; fUnknown = fUnknownRL; - //Recorremos el texto en sentido RL para seleccionar ejemplos - nWordsRL = learnerRightToLeft(&featureList,&featureListUnk, d_for_known , dUnknown, nWordsRL, inicioRL); - inicioRL = sw->getIndex()->ord; - if ( sw->previous() != 0 ) - { - is_end_of_sentence = TRUE; - inicioRL = -1; - } - } - - contSentences++; - - if ( verbose_svmtool == TRUE) showProcess(contSentences,0); - //Si es fin de frase recargamos la ventana - if ( is_end_of_sentence == TRUE ) - { - sw->deleteList(); - ret = sw->iniGeneric(); - } - - //Si hemos recorrido todas la palabras del chunk es fin de chunk - if ( nWordsRL <= 0 || nWordsLR <= 0 ) - { - is_end_of_chunk = TRUE; - } - } - if ( verbose_svmtool == TRUE) showProcess(contSentences,1); - delete dUnknown; - - //Si es necesario borramos los ficheros temporales - char cmd[200]; - sprintf(cmd, "rm -f %s",name); - if (REMOVE_FILES == TRUE) system(cmd); - } - - //Creamos una copia de la lista de etiquetas para palabras conocidas - simpleList *copyOfUNKP_L = 
learnerTransformHashInList(learnerUNKP_H); - //para cada tipo known o unk - //para cada direccion - if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) - { - //Realizamos el entrenamiento - learnerDoLearn(fKnownLR,numModel,LEFT_TO_RIGHT,KNOWN,dKnown,learnerAMBP_L); - learnerDoLearn(fUnknownLR,numModel,LEFT_TO_RIGHT,UNKNOWN,dKnown,copyOfUNKP_L); - fclose (fKnownLR); - fclose (fUnknownLR); - } - - if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) - { - //Realizamos el entrenamiento - learnerDoLearn(fKnownRL,numModel,RIGHT_TO_LEFT,KNOWN,dKnown,learnerAMBP_L); - learnerDoLearn(fUnknownRL,numModel,RIGHT_TO_LEFT,UNKNOWN,dKnown,copyOfUNKP_L); - fclose(fKnownRL); - fclose(fUnknownRL); - } - //fin para cada direccion - //fin para cada tipo - - destroyFeatureList(&featureList,featureList.numElements()); - destroyFeatureList(&featureListUnk,featureListUnk.numElements()); -} - - -/**********************************************************/ - -/* - * Recibe como parmetros: wrd un string con una palabra, - * pos un string con una etiqueta, Known_or_Unknown si la - * palabra es conocida o desconocida. Si la palabra es conocida - * devuelve TRUE si la etiqueta pos est en la lista de etiquetas - * morfosintcticas ambiguas. Si la palabra es desconocida - * devuelve TRUE si la etiqueta pos pertenece a lista de posibles - * categoras morfosintcticas para palabras desconocidas. En cualquier - * otro caso devuelve FALSE. 
- */ -int learner::learnerIsPossiblePOS(char *wrd, char *pos,int Known_or_Unknown) -{ - if (Known_or_Unknown==KNOWN) - { - if (HASH_FAIL != hash_lookup(learnerAMBP_H,pos)) - { - return TRUE; - } - } - else if (Known_or_Unknown==UNKNOWN) - { - if (HASH_FAIL != hash_lookup(learnerUNKP_H,pos)) - { - return TRUE; - } - } - return FALSE; -} - - -/**************************************************************/ -/* - * Recibe como parmetros: - * wrd un string con una palabra, - * Known_or_Unknown un entero que indica si la palabra - * es conocida o desconocida y - * un apuntador d al diccionario. Devuelve un apuntador a una lista. - */ -simpleList *learner::learnerGetPotser(char *wrd, int Known_or_Unknown, dictionary *d) -{ - int stop=FALSE,ret=1; - int w = d->getElement(wrd); - infoDict *pInfoDict; - simpleList *lout = new simpleList; - - //Si es conocida recoge las posibles etiquetas que contiene el diccionario - //para la palabra wrd y mira si existen en la lista de categoras - //morfosintcticas ambiguas. Si existen en la lista de etiquetas ambiguas - //las aade a la lista de salida, en caso contrario si no esta es posible - //etiqueta ambigua y es la etiqueta ms frecuente se devuelve una lista - //slo con la etiqueta ms frecuente. 
- if (Known_or_Unknown==KNOWN) - { - if (w != HASH_FAIL) - { - simpleList *list = (simpleList *) d->getElementMaybe(w); - list->setFirst(); - while (ret >= 0 && stop == FALSE) - { - pInfoDict = (infoDict *) list->getIndex(); - - if (HASH_FAIL!=hash_lookup(learnerAMBP_H,pInfoDict->txt)) - { - infoDict *ptr = new infoDict; - strcpy(ptr->txt,pInfoDict->txt); - ptr->num = pInfoDict->num; - lout->add(ptr); - } - //si es most frequent tag - else if ( strcmp(d->getMFT(w),pInfoDict->txt)==0 ) - { - d->dictCleanListInfoDict(lout,lout->numElements()); - lout->deleteList(); - infoDict *ptr = new infoDict; - strcpy(ptr->txt,pInfoDict->txt); - ptr->num = pInfoDict->num; - lout->add(ptr); - stop = TRUE; - } - ret=list->next(); - } - list->setFirst(); - } - } - //Si la palabra es desconocida la lista de salida contiene todas las - //categoras morfosintcticas posibles desconocidas. - else if (Known_or_Unknown==UNKNOWN) - { - learnerUNKP_L->setFirst(); - while (ret>=0) - { - pInfoDict =(infoDict *) learnerUNKP_L->getIndex(); - infoDict *ptr = new infoDict; - strcpy(ptr->txt,pInfoDict->txt); - ptr->num = pInfoDict->num; - lout->add(ptr); - - ret=learnerUNKP_L->next(); - } - learnerUNKP_L->setFirst(); - } - - return lout; -} - - -/**********************************************************/ -/* - * Recibe como parmetros: - * un apuntador al depsito de pesos (wr), - * un apuntador a mapping (m), - * un string con la etiqueta (pos) que estamos entrenando y - * el nombre del fichero (fileName) en el cual se almacenan las SVs - * para la etiqueta indicada. - * - * Partiendo de estos datos recorre el fichero fileName, traduciendo - * cada uno de los atributos ledos gracias al mapping (m). - * Y aadiendo la correspondiente pareja etiqueta/atributo - * al depsito de pesos wr mediante el mtodo wrAdd. - * - * Se devuelve el apuntador al depsito de pesos (weightRepository) - * que ha sido modificado. 
- */ -weightRepository *learner::learnerBuiltWeightRepository(weightRepository *wr,mapping *m,char *pos,char *fileName) -{ - char str[200]=""; - char *key; - FILE *f = openFile(fileName,"r"); - int ret=1,trobat=FALSE; - - while (!feof(f) && ret>=0) - { - if (trobat==FALSE) - { - ret= readTo(f,'\n',0,str); - if (strstr(str,"threshold")!=NULL) trobat=TRUE; - } - else - { - memset(str,0,strlen(str)); - ret = readTo(f,' ',0,str); - long double ld; - - ld = atof(str); - - while (ret>0) - { - ret = readTo(f,':','\n',str); - if (ret>0) - { - key = m->mappingGetFeatureByNumber(str); - ret = readTo(f,' ','\n',str); - if ((uintptr_t)key != HASH_FAIL) wr->wrAdd(key,pos,ld); - } - } - - } - } - fclose (f); - return wr; -} - - -/**************************************************************/ - -void learner::learnerDestroyBias(hash_t *h) -{ - weight_node_t *aux; - - hash_t *tptr = h; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - //Recorre cada lista de sinnimos del hash eliminando su contenido - for (int i=0; inext; - - aux = (weight_node_t *) tmp->data; - - delete aux; - aux = NULL; - } /* while */ - } /* for */ - - hash_destroy(h); -} - - -/**************************************************************/ - -/* - * Recibe como parmetros: - * Un apuntador a hash_t (h) que es el lugar donde almacenaremos los sesgos, - * la etiqueta (pos) que estamos entrenando y el fichero (fileName) de salida - * de SVM-light en el cual se encuentran los datos esperados. - * Este mtodo lee el sesgo del fichero y lo aade (hash_insert) - * al hashing h para la etiqueta pos indicada. - * Posteriormente devuelve el apuntador al hashing modificado. 
- */ -hash_t *learner::learnerBuiltBias(hash_t *h,char *pos,char *fileName) -{ - char str[200]=""; - FILE *f = openFile(fileName ,"r"); - int ret=1,trobat=FALSE; - - while (!feof(f) && trobat==FALSE && ret>=0) - { - ret= readTo(f,'\n',0,str); - if (strstr(str,"threshold")!=NULL) trobat=TRUE; - } - - char bias[200]; - int sortir = FALSE; - strcpy(bias,""); - for (int i=0;(ipos,pos); - w->data = (long double)0; - w->data = atof (bias); - hash_insert(h,w->pos,(uintptr_t)w); - - fclose(f); - return h; -} - - -/********************************************************/ - -/* - * Recibe como parmetro el nombre del fichero de configuracin (train). - * Ejecuta el aprendizaje - */ -void learner::learnerRun(char *train) -{ - int iWrd=0,iSent=0; - - init_stack(&DO); - //Leemos el fichero de configuracin - read_config_file(train); - //obtener tamao del corpus - learnerCount(TRAINSET,&iWrd,&iSent); - - if ( verbose_svmtool == TRUE ) - { - fprintf(stderr,"\n* trainset # words = [ %d ]",iWrd); - fprintf(stderr,"\n* trainset # sentences = [ %d ]",iSent); - fprintf(stderr,"\n* ========================================================================"); - fprintf(stderr,"\n\n* ========================================================================"); - fprintf(stderr,"\n* PREPARING TRAINING"); - fprintf(stderr,"\n* ========================================================================"); - } - struct tms tbuff1,tbuff2; - clock_t start,end; - start = times(&tbuff1); - - //Creamos el diccionario - char name[200]; - sprintf(name,"%s.DICT",NAME); - dictionary *d = new dictionary(TRAINSET, 0,0); - // "/mnt/hda4/pfc/WSJ.200"); - if (R!=NULL) d->dictRepairFromFile(R); - else d->dictRepairHeuristic(DRATIO); - d->dictWrite(name); - delete d; - d = new dictionary(name); - - //Obtenemos las listas de etiquetas - if (learnerAMBP_H == NULL) - learnerAMBP_H = d->dictFindAmbP(&learnerNumAMBP); - if (learnerUNKP_H == NULL) - learnerUNKP_H = d->dictFindUnkP(&learnerNumUNKP); - //Creamos los 
ficheros de etiquetas - learnerCreatePOSFile(NAME,TRUE,learnerAMBP_H); - learnerCreatePOSFile(NAME,FALSE,learnerUNKP_H); - //Creamos unos hashings con las listas de etiquetas - learnerAMBP_L = learnerTransformHashInList(learnerAMBP_H); - learnerUNKP_L = learnerTransformHashInList(learnerUNKP_H); - - //creamos el fichero de configuracin de la ventana - learnerCreateDefaultFile(NAME,"WIN"); - - //Calculamos en numero de chunks - int chunks = learnerNumChunks(TRAINSET,X,iSent); - //Mientras haya modelos por entrenar - while (!empty(&DO)) - { - int *numModel = (int *) pop(&DO); - int *direction = (int *) pop(&DO); - - //Eliminamos los ficheros anteriores - removeFiles(NAME, RM_MODEL_FILES,*numModel, *direction, verbose_svmtool); - removeFiles(NAME, RM_TEMP_FILES ,*numModel, *direction, verbose_svmtool); - - sprintf(name,"A%d",*numModel); - learnerCreateDefaultFile(NAME,name); - - //Creamos la ventana - FILE *f = fopen(TRAINSET, "r"); - sw = new swindow(f,WINDOW_SIZE,CORE_POSITION); - //ejecutamos el entrenamiento para el modelo - learnerTrainModel(TRAINSET,d,*numModel,*direction,iSent,iWrd,chunks); - delete sw; - } - delete d; - - end = times(&tbuff2); - if ( verbose_svmtool == TRUE ) - { - fprintf(stderr,"\n\n* ========================================================================\n"); - showTime ("* SVM-light Time", time_svmlight, time_svmlight, 0); - showTime ("* SVMTlearn Time", - //CLK_TCK, - ((double)(end-start))/CLOCKS_PER_SECOND - time_svmlight, - ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); - showTime ("* Total Learning Time", - time_svmlight + ((double)(end-start))/CLOCKS_PER_SECOND, - time_svmlight + ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); - fprintf(stderr,"* ========================================================================\n\n"); - } -} - - 
-/**************************************************************/ - -/* - * Recorre el texto de izquierda a derecha seleccionando ejemplos - */ -int learner::learnerLeftToRight(simpleList *featureList, simpleList *featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio) -{ - int ret = 0; - - if ( inicio == -1 ) while (sw->previous()==0); - else if (sw->getIndex()->ord!=inicio) - { - while(ret == 0 && sw->getIndex()->ord != inicio) - { - if (inicio < sw->getIndex()->ord ) ret = sw->previous(); - if (inicio > sw->getIndex()->ord ) ret = sw->next(); - } - } - nodo *elem = sw->getIndex(); - numWrds--; - learnerGenerateFeatures(elem,featureList,dKnown, LEFT_TO_RIGHT); - learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT); - - while(numWrds>=0) - { - if ( sw->next() != 0 ) return numWrds; - - elem = sw->getIndex(); - numWrds--; - learnerGenerateFeatures(elem,featureList,dKnown,LEFT_TO_RIGHT); - learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT); - } - - return (numWrds); -} - - -/**************************************************************/ - -/* - * Recorre el texto de derecha a izquierda seleccionando ejemplos - */ -int learner::learnerRightToLeft(simpleList *featureList, simpleList *featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio) -{ - int ret = 0; - - if ( inicio == -1 ) while (sw->next()==0); - else if ( sw->getIndex()->ord != inicio ) - { - while(ret == 0 && sw->getIndex()->ord != inicio) - { - if (inicio < sw->getIndex()->ord ) ret = sw->previous(); - if (inicio > sw->getIndex()->ord ) ret = sw->next(); - } - } - - nodo *elem = sw->getIndex(); - numWrds--; - learnerGenerateFeatures(elem,featureList,dKnown,RIGHT_TO_LEFT); - learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown,dUnknown,RIGHT_TO_LEFT); - - while( numWrds>=0 ) - { - if ( sw->previous() != 0 ) return numWrds; - - elem = sw->getIndex(); - numWrds--; - 
learnerGenerateFeatures(elem,featureList,dKnown, RIGHT_TO_LEFT); - learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, RIGHT_TO_LEFT); - } - - return numWrds; -} - - -/**************************************************************/ -/* - * Esta funcin recibe como parmetros: - * el apuntador a un nodo de la ventana (elem), - * una pila donde apilara los atributos generados (stk), - * la lista de atributos que debe generar (featureList), - * el diccionario con la informacin necesaria para el clculo de features (d) - * y la direccin en que se recorre el corpus (direction). - * Recorre la lista featureList y ejecuta los mtodos necesarios - * de la ventana (swindow) para generar los atributos y que al final - * de la ejecucin de este mtodo esten apilados en stk. - */ -void learner::learnerGetFeatures(nodo *elem, stack_t *stk,dictionary *d, simpleList *featureList, int direction) -{ - nodo_feature_list *aux = NULL; - int ret = 1; - //Recorre la lista de atributos y crea los atributos correspondientes - while (ret>=0) - { - aux = (nodo_feature_list *) featureList->getIndex(); - if (strcmp(aux->mark,SLASTW)==0) sw->winPushSwnFeature(stk); - else if (strcmp(aux->mark,WMARK)==0) sw->winPushWordFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,KMARK)==0) sw->winPushAmbiguityFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,MMARK)==0) sw->winPushMaybeFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,PMARK)==0) sw->winPushPosFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,MFTMARK)==0) sw->winPushMFTFeature((void *)aux,d,stk,direction); - else - { - int *param; - if (aux->n>0) - { - param = (int *) aux->l.getIndex(); - } - if (strcmp(aux->mark,PREFIX_MARK)==0) sw->winPushPrefixFeature(elem->wrd, stk, *param); - else if (strcmp(aux->mark,SUFFIX_MARK)==0) sw->winPushSuffixFeature(elem->wrd, stk, *param); - else if (strcmp(aux->mark,CHAR_A_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, 
COUNTING_FROM_BEGIN); - else if (strcmp(aux->mark,CHAR_Z_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, COUNTING_FROM_END); - else if (strcmp(aux->mark,LENGTH_MARK)==0) sw->winPushLenghtFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_CAPITAL_MARK)==0) sw->winPushStartWithCapFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_LOWER_MARK)==0) sw->winPushStartWithLowerFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_NUMBER_MARK)==0) sw->winPushStartWithNumberFeature(elem->wrd,stk); - else if (strcmp(aux->mark,ALL_UPPER_MARK)==0) sw->winPushAllUpFeature(elem->wrd,stk); - else if (strcmp(aux->mark,ALL_LOWER_MARK)==0) sw->winPushAllLowFeature(elem->wrd,stk); - else if (strcmp(aux->mark,CONTAIN_CAP_MARK)==0) sw->winPushContainCapFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_CAPS_MARK)==0) sw->winPushContainCapsFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_COMMA_MARK)==0) sw->winPushContainCommaFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_NUMBER_MARK)==0) sw->winPushContainNumFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_PERIOD_MARK)==0) sw->winPushContainPeriodFeature(elem->wrd, stk); - else if (strcmp(aux->mark,MULTIWORD_MARK)==0) sw->winPushMultiwordFeature(elem->wrd, stk); - } - ret = featureList->next(); - } - featureList->setFirst(); -} - - -/**************************************************************/ - -/* - * El objetivo de este mtodo es seleccionar o descartar una palabra para - * realizar en el entrenamiento de palabras desconocidas, calcular respectivos - * atributos e insertar esta informacin en el fichero de ejemplos correspondiente. 
- */ -void learner::learnerGenerateFeaturesUnk(nodo *elem, simpleList *featureList,dictionary *d, dictionary *dUnk, int direction) -{ - stack_t stk; - nodo_feature_list *aux; - int is_selected = FALSE; - char *feature = NULL; - char buffer[1000]; - - strcpy(buffer,""); - - if (d==NULL || elem==NULL || featureList==NULL) return; - - init_stack(&stk); - - sprintf(buffer,"%s:%s",elem->wrd,elem->comment); - - int i = dUnk->getElement(elem->wrd); - if (i!=HASH_FAIL) - { - /* - int i2 = d->getElement(elem->wrd); - if (dUnk->getElementNumMaybe(i) == 1 && hash_lookup(learnerUNKP_H,d->getMFT(i2))!=HASH_FAIL ) - { - fprintf(fUnknown,buffer,strlen(buffer)); - is_selected = TRUE; - } - */ - } - else - { - fprintf(fUnknown,buffer,strlen(buffer)); - is_selected = TRUE; - } - - if ( is_selected == TRUE) learnerGetFeatures(elem, &stk,dUnk, featureList, direction ); - - while (!empty(&stk) && is_selected == TRUE) - { - feature = (char *) pop(&stk); - sprintf(buffer," %s",feature); - fprintf(fUnknown,buffer,strlen(buffer)+1); - delete feature; - } - - if (is_selected == TRUE) fprintf(fUnknown,"\n"); - - strcpy(elem->pos,elem->comment); -} - - -/**************************************************************/ - -/* - * El objetivo de este mtodo es seleccionar o descartar una palabra para - * realizar en el entrenamiento de palabras conocidas, calcular respectivos - * atributos e insertar esta informacin en el fichero de ejemplos correspondiente. 
- */ -void learner::learnerGenerateFeatures(nodo *elem, simpleList *featureList,dictionary *d, int direction) -{ - stack_t stk; - nodo_feature_list *aux; - int is_selected = FALSE; - int is_unk = FALSE; - char *feature,buffer[1000]=""; - - if (d==NULL || elem==NULL || featureList==NULL) return; - - init_stack(&stk); - - sprintf(buffer,"%s:%s",elem->wrd,elem->comment); - - int i = d->getElement(elem->wrd); - if (i!=HASH_FAIL) - { - - if ( d->getElementNumMaybe(i)>1 && hash_lookup(learnerAMBP_H,d->getMFT(i))!=HASH_FAIL ) - { - fprintf(fKnown,buffer,strlen(buffer)); - is_selected = TRUE; - } - - } - - if ( is_selected == TRUE) learnerGetFeatures(elem, &stk,d, featureList, direction ); - - while (!empty(&stk) && is_selected == TRUE) - { - feature = (char *) pop(&stk); - sprintf(buffer," %s",feature); - /* if (i!=HASH_FAIL) - {*/ - fprintf(fKnown,buffer,strlen(buffer)+1); - - delete feature; - } - - sprintf(buffer,"\n"); - - if ( is_selected == TRUE) fprintf(fKnown,buffer,strlen(buffer)); - - strcpy(elem->pos,elem->comment); -} - - -/************************************************************/ -/* - * Ejecuta SVM-light. Recibe como parmetros cuatro cadenas de - * caracteres: svmdir es el directorio en el que se encuentra SVM-light, - * options son las opciones con que se lanzar SVM-light, posFile el nombre - * del fichero de ejemplos usado como entrada para la herramienta - * de Joachims y, por ltimo, outFile que es el nombre del fichero - * de salida. Esta funcin devuelve 0. 
- */ -int learner::learnerExecSVMlight(char *svmdir, char *options, char *posFile, char *outFile) -{ - time_t begin, finish; - - begin = time (0); - - char command[500]=""; - float c; - strcpy(command,""); - sprintf(command,"%s/svm_learn -v 0 %s %s %s",svmdir,options,posFile,outFile); - - if ( verbose_svmtool == TRUE ) fprintf(stderr,"Executing Joachims svm_light [ with options: %s ] ",options); - system(command); - if ( verbose_svmtool == TRUE ) fprintf(stderr," [DONE]"); - - finish = time(0); - - time_svmlight = difftime(finish,begin) + time_svmlight; - - return 0; -} - - -/**************************************************************/ - -simpleList *learner::learnerTransformHashInList(hash_t *tptr) -{ - hash_node_t *node, *last; - int i; - simpleList *l = new simpleList; - - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - infoDict *p = (infoDict *) last->data; - l->add( (void *)last->data); - } - } - l->setFirst(); - return l; -} - - -/**************************************************************/ - -/* - * Calcularemos el nmero de fragmentos en el que se ha de dividir - * el corpus para conseguir un porcentaje de palabras desconocidas - * determinado. Lo parmetros de entrada son el nombre del fichero - * de entrenamiento (trainingFileName), el porcentaje de palabras - * desconocidas deseado (percentage) y el nmero de frases del corpus - * (nSentences). El valor devuelto es un entero indicando el nmero de - * fragmentos. Si el nmero de fragmentos calculado es mayor que el nmero - * de frases que contiene el corpus de entrenamiento se devolver como el - * nmero de frases (nSentences). 
- */ -int learner::learnerNumChunks(char *trainingFileName,float percentage,int nSentences) -{ - int ret=0,ndwords=0,nwords=0; - char wrd[500]; - - FILE *f = openFile (trainingFileName,"r"); - hash_t h1; - hash_init(&h1,10000); - while (!feof(f)) - { - ret = readTo(f,' ','\n',wrd); - if (ret>=0) - { - nwords++; - char *w = new char[strlen(wrd)+1]; - strcpy(w,wrd); - if ((uintptr_t)hash_insert(&h1,w,(uintptr_t) w)==HASH_FAIL) ndwords++; - if (ret>0) readTo(f,'\n','\n',wrd); - } - } - - //Read again until a certain point where X is met --> $ndwords * (100 - $X) / 100; - float meeting = ndwords * (100 - X ) /100; - int nwords2=0,ndwords2=0; - fseek(f,0,SEEK_SET); - hash_t h2; - hash_init(&h2,10000); - while (!feof(f) && ndwords2=0) - { - nwords2++; - char *w = new char[strlen(wrd)+1]; - strcpy(w,wrd); - if (hash_insert(&h2,w,(uintptr_t) w)==HASH_FAIL) ndwords2++; - if (ret>0) readTo(f,'\n','\n',wrd); - } - } - - int chunks = nwords/(nwords - nwords2); - if (nSentences<=chunks) chunks = nSentences; - - fclose (f); - - hash_destroy(&h2); - hash_destroy(&h1); - return chunks; -} - - -/**************************************************************/ - -int learner::learnerIsInsideList(simpleList *l, char *key) -{ - if (l==NULL || key==NULL || strcmp(key,"")==0) return FALSE; - int ret = 0; - while (ret>=0) - { - infoDict *ptr = (infoDict *)l->getIndex(); - if (strcmp(key,ptr->txt)==0) - { - return TRUE; - } - } - l->setFirst(); - return FALSE; -} +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "marks.h" +#include "hash.h" +#include "list.h" +#include "dict.h" +#include "swindow.h" +#include "mapping.h" +#include "weight.h" +#include "learner.h" +#include "common.h" +#include "nodo.h" + +/**************************************************************/ + +extern int verbose; +double time_svmlight = 0; + +/**************************************************************/ + +std::stack DO; +int KERNEL=0; +int DEGREE=0; +float CK = 0; +float CU = 0; +float X = 3; +int MAX_MAPPING_SIZE = 100000; +int COUNT_CUT_OFF = 2; +int WINDOW_SIZE = 5; +int CORE_POSITION = 2; +char *TRAINSET = NULL; +char *SVMDIR = NULL; +char *NAME = NULL; +char *BLEX = NULL; +char *R = NULL; +float DRATIO = 0.001; +float ERATIO=0; +float KFILTER = 0; +float UFILTER = 0; +int REMOVE_FILES = TRUE; + +/**************************************************************/ + +char *UP = NULL; +char *AP = NULL; + +/**************************************************************/ + +//ambiguous-right [default] +std::string A0 = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"); +std::string A0UNK = std::string( 
"w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(2)\na(3)\na(4)\nz(2)\nz(3)\nz(4)\nca(1)\ncz(1)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"); + +/**************************************************************/ + +//unambiguous-right +std::string A1 = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"); +std::string A1UNK = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"); + +/**************************************************************/ + +//no-right +std::string A2 = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\n"); +std::string A2UNK = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"); + +/**************************************************************/ + +//unsupervised-learning +std::string A3 = std::string( 
"w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\n"); +std::string A3UNK = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"); + +/**************************************************************/ + +//ambiguous-right ++ unknown words on training +std::string A4 = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n"); +std::string A4UNK = std::string( "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n"); + +/**************************************************************/ + +std::string trim(std::string const& source, char const* delims = " \t\r\n") { + std::string result(source); + std::string::size_type index = result.find_last_not_of(delims); + if(index != std::string::npos) + result.erase(++index); + + index = result.find_first_not_of(delims); + if(index != std::string::npos) + result.erase(0, index); + else + result.erase(); + return result; +} + +std::string learner::read_feature_list_from_config_file(FILE *f, char *first_feature) +{ + std::string tmp,str; + tmp = first_feature; + 
tmp += "\n"; + + int cont=1; + int ret=1; + + while (ret>0) + { + str.clear(); + ret = readTo(f,' ','\n',str); + if (ret>=0) + { + tmp += str + "\n"; + cont++; + } + } + return tmp; +} + +std::string learner::read_feature_list_from_string(const std::vector& tokens) +{ + std::string tmp; + + vector::const_iterator token = tokens.begin(); + for(; token != tokens.end(); token++ ) + { + string f(*token); + tmp += *token + "\n"; + } + return tmp; +} + +/**************************************************************/ + +void learner::read_config_file(const std::string& config_file) +{ + std::string configFile = config_file; + + std::ifstream file(configFile.c_str()); + std::string line; + + using namespace std; + + string delimiter(" "); + string equaldelimiter("="); + + //copy(tokens.begin(), tokens.end(), ostream_iterator(cout, ", ")); + //std::string command = trim(line.substr(1, line.find(' ')-1)); + //while( token != tokens.end() ) { cout << *token + " "; token++; } + //cout << "\n"; + //exit(0); + + while (std::getline(file,line)) { + if (! 
line.length()) continue; + if (line[0] == '#') continue; + + vector tokens; + Tokenize(trim(line), tokens, delimiter); + vector::iterator token = tokens.begin(); + //while( token != tokens.end() ) { cout << *token + " "; token++; } + //cout << "\n"; + + if (tokens.size() == 0) continue; + std::string command = tokens[0]; + int posEqual=line.find('='); + + if (command == "do") { + int *modelo = new int; *modelo = 0; + int *direction = new int; *direction = LEFT_TO_RIGHT; + if (tokens.size() >= 3) { + std::string MODEL = trim(tokens[1].substr(1, line.find('M'))); + *modelo = atoi(MODEL.c_str()); + if (tokens[2].compare("LR") == 0) { *direction = LEFT_TO_RIGHT; } + else if (tokens[2].compare("RL") == 0) { *direction = RIGHT_TO_LEFT; } + else if (tokens[2].compare("LRL")==0) { *direction = LR_AND_RL; } + DO.push(direction); + DO.push(modelo); + } + } + else if (posEqual >= 0) { + vector tokenseq; + Tokenize(trim(line), tokenseq, equaldelimiter); + vector::iterator token = tokens.begin(); + + if (tokens.size() < 2) continue; + + string arg = trim(tokenseq[0]); + string param = trim(tokenseq[1]); + + //cout << arg << " :: " << param << "\n"; + + vector params; + Tokenize(trim(param), params, delimiter); + + if ((arg.compare("A0k") == 0) || (arg.compare("A0") == 0)) { A0 = read_feature_list_from_string(params); } + else if ((arg.compare("A1k") == 0) || (arg.compare("A1") == 0)) { A1 = read_feature_list_from_string(params); } + else if ((arg.compare("A2k") == 0) || (arg.compare("A2") == 0)) { A2 = read_feature_list_from_string(params); } + else if ((arg.compare("A3k") == 0) || (arg.compare("A3") == 0)) { A3 = read_feature_list_from_string(params); } + else if ((arg.compare("A4k") == 0) || (arg.compare("A4") == 0)) { A4 = read_feature_list_from_string(params); } + else if ((arg.compare("A0u") == 0) || (arg.compare("A0unk") == 0)) { A0UNK = read_feature_list_from_string(params); } + else if ((arg.compare("A1u") == 0) || (arg.compare("A1unk") == 0)) { A1UNK = 
read_feature_list_from_string(params); } + else if ((arg.compare("A2u") == 0) || (arg.compare("A2unk") == 0)) { A2UNK = read_feature_list_from_string(params); } + else if ((arg.compare("A3u") == 0) || (arg.compare("A3unk") == 0)) { A3UNK = read_feature_list_from_string(params); } + else if ((arg.compare("A4u") == 0) || (arg.compare("A4unk") == 0)) { A4UNK = read_feature_list_from_string(params); } + + + else if (arg.compare("F") == 0) { + if (params.size() >= 2) { COUNT_CUT_OFF = atoi(params[0].c_str()); MAX_MAPPING_SIZE = atoi(params[1].c_str()); } + } + else if (arg.compare("W") == 0) { + if (params.size() >= 2) { WINDOW_SIZE = atoi(params[0].c_str()); CORE_POSITION = atoi(params[1].c_str()); } + } + else if (arg.compare("TRAINSET") == 0) { + if (params.size() >= 1) { TRAINSET = new char[params[0].length()+1]; strcpy(TRAINSET, params[0].c_str()); } + } + else if (arg.compare("BLEX") == 0) { + if (params.size() >= 1) { BLEX = new char[params[0].length()+1]; strcpy(BLEX, params[0].c_str()); } + } + else if (arg.compare("R") == 0) { + if (params.size() >= 1) { R = new char[params[0].length()+1]; strcpy(R, params[0].c_str()); } + } + else if (arg.compare("SVMDIR") == 0) { + if (params.size() >= 1) { SVMDIR = new char[params[0].length()+1]; strcpy(SVMDIR, params[0].c_str()); } + } + else if (arg.compare("NAME") == 0) { + if (params.size() >= 1) { NAME = new char[params[0].length()+1]; strcpy(NAME, params[0].c_str()); } + } + else if (arg.compare("REMOVE_FILES") == 0) { + if (params.size() >= 1) { REMOVE_FILES = atoi(params[0].c_str()); } + } + else if (arg.compare("CK") == 0) { + if (params.size() >= 1) { CK = atof(params[0].c_str()); } + } + else if (arg.compare("CU") == 0) { + if (params.size() >= 1) { CU = atof(params[0].c_str()); } + } + else if (arg.compare("Dratio") == 0) { + if (params.size() >= 1) { DRATIO = atof(params[0].c_str()); } + } + else if (arg.compare("Eratio") == 0) { + if (params.size() >= 1) { ERATIO = atof(params[0].c_str()); } + } + else if 
(arg.compare("Kfilter") == 0) { + if (params.size() >= 1) { KFILTER = atof(params[0].c_str()); } + } + else if (arg.compare("Ufilter") == 0) { + if (params.size() >= 1) { UFILTER = atof(params[0].c_str()); } + } + else if (arg.compare("X") == 0) + { + if (params.size() >= 1) { X = atof(params[0].c_str()); } + } + else if (arg.compare("AP") == 0) + { + learnerAMBP_H = new hash_t; + learnerAMBP_H->hash_init(30); + vector::iterator p = params.begin(); + while( p != params.end()) + { + infoDict * etiq = new infoDict; + string tag = *p; + etiq->pos = tag.c_str(); + learnerAMBP_H->hash_insert(etiq->pos,etiq); + p++; + } + } + else if (arg.compare("UP") == 0) { + learnerUNKP_H = new hash_t; + learnerUNKP_H->hash_init(30); + vector::iterator p = params.begin(); + while( p != params.end()) { + infoDict * etiq = new infoDict; + string tag = *p; + etiq->pos = tag.c_str(); + learnerUNKP_H->hash_insert(etiq->pos,etiq); + p++; + } + } + } + } + + if (verbose==TRUE) { + fprintf(stderr,"\n* ===================== SVMTlearn configuration =========================="); + fprintf(stderr,"\n* config file = [ %s ]\n* trainset = [ %s ]\n* model name = [ %s ]",config_file.c_str(),TRAINSET,NAME); + fprintf(stderr,"\n* SVM-light dir = [ %s ]",SVMDIR); + fprintf(stderr,"\n* ========================================================================"); + fprintf(stderr,"\n* unknown words expected = [ X = %f %% ]",X); + fprintf(stderr,"\n* C parameter for known = [ CK = %f ]",CK); + fprintf(stderr,"\n* C parameter for unknown = [ CU = %f ]",CU); + fprintf(stderr,"\n* D ratio = [ Dratio = %f ]",DRATIO); + fprintf(stderr,"\n* E ratio = [ Eratio = %f ]",ERATIO); + fprintf(stderr,"\n* Known weights filter = [ Kfilter = %f ]",KFILTER); + fprintf(stderr,"\n* Unknown weights filter = [ Ufilter = %f ]",UFILTER); + fprintf(stderr,"\n* sliding window settings = [ WINDOW SIZE = %d , CORE POSITION = %d ]",WINDOW_SIZE,CORE_POSITION); + fprintf(stderr,"\n* mapping settings = [ COUNT CUT OFF = %d , MAX MAPPING 
SIZE = %d ]",COUNT_CUT_OFF,MAX_MAPPING_SIZE); + fprintf(stderr,"\n* remove temporal files = [ %d ] (1) TRUE, (0) FALSE",REMOVE_FILES); + fprintf(stderr,"\n* ========================================================================"); + } + + if (TRAINSET == NULL) { fprintf (stderr,"\nError: TRAINSET parameter not found in %s.\n",config_file.c_str()); exit(-1); } + if (NAME == NULL) { fprintf (stderr,"\nError: MODEL NAME parameter not found in %s.\n",config_file.c_str()); exit(-1); } + if (SVMDIR == NULL) { fprintf (stderr,"\nError: SVM DIRECTORY parameter not found in %s.\n",config_file.c_str()); exit(-1); } +} + +/**************************************************************/ + +learner::learner() +{ + learnerAMBP_H = NULL ; + learnerUNKP_H = NULL ; + learnerNumFeatures = 1000; +} + +learner::~learner() +{ + if (REMOVE_FILES==TRUE) + { + removeFiles(NAME,RM_TEMP_FILES,0,0,verbose); + } + + if ( verbose == TRUE ) fprintf(stderr,"\n\nTERMINATION ... "); + + if (TRAINSET!=NULL) delete TRAINSET; + if (SVMDIR!=NULL) delete SVMDIR; + if (NAME!=NULL) delete NAME; + if (BLEX!=NULL) delete BLEX; + if (R!=NULL) delete R; + + if ( verbose == TRUE ) fprintf(stderr,"[DONE]\n\n"); +} + +/***************************************************************/ + +void learner::learnerCreatePOSFile(const std::string& modelName, int is_ambp, hash_t< infoDict* >* h) +{ + std::string name; + if (is_ambp==TRUE) + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.AMBP",modelName.c_str()); + name = modelName + ".AMBP"; + FILE *f = openFile (name.c_str(),"w"); + h->hash_print(f); + fclose(f); + } + else + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.UNKP",modelName.c_str()); + name = modelName + ".UNKP"; + FILE *f = openFile (name.c_str(),"w"); + h->hash_print(f); + fclose(f); + } +} + +/***************************************************************/ + +void learner::learnerCreateDefaultFile(const std::string &modelName, const std::string& str) +{ + std::string name; + name = 
modelName+ "." + str; + FILE *f = openFile(name.c_str(), "w"); + + if (str == "A0") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A0",modelName.c_str()); fprintf(f,"%s",A0.c_str()); + name = modelName + "."+str+".UNK"; + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A0.UNK",modelName.c_str()); + FILE *funk = openFile(name.c_str(), "w"); + fprintf(funk,"%s",A0UNK.c_str()); + fclose(funk); + } + else if (str == "A1") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A1",modelName.c_str()); fprintf(f,"%s",A1.c_str()); + name = modelName + "."+str+".UNK"; + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A1.UNK",modelName.c_str()); + FILE *funk = openFile(name.c_str(), "w"); + fprintf(funk,"%s",A1UNK.c_str()); + fclose(funk); + } + else if (str == "A2") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A2",modelName.c_str()); fprintf(f,"%s",A2.c_str()); + name = modelName+"."+str+".UNK"; + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A2.UNK",modelName.c_str()); + FILE *funk = openFile(name.c_str(), "w"); + fprintf(funk,"%s",A2UNK.c_str()); + fclose(funk); + } + else if (str == "A3") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A3",modelName.c_str()); fprintf(f,"%s",A3.c_str()); + name = modelName+"."+str+".UNK"; + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A3.UNK",modelName.c_str()); + FILE *funk = openFile(name.c_str(), "w"); + fprintf(funk,"%s",A3UNK.c_str()); + fclose(funk); + } + else if (str == "A4") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A4",modelName.c_str()); fprintf(f,"%s",A4.c_str()); + name = modelName + "."+str+".UNK"; + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A4.UNK",modelName.c_str()); + FILE *funk = openFile(name.c_str(), "w"); + fprintf(funk,"%s",A4UNK.c_str()); + fclose(funk); + } + else if (str == "WIN") + { + if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.WIN",modelName.c_str()); + fprintf(f,"%d\n%d\n",WINDOW_SIZE,CORE_POSITION); + } + fclose(f); +} 
+
+/***************************************************************/
+
+/*
+ * Scans `channel` for the next attribute-type letter.
+ * Characters are skipped until one of 'w', 'p', 'k' or 'm' is read; the
+ * single character that follows it is then consumed and discarded.
+ * Returns 'w', 'p' or 'k' unchanged, 's' when 'm' was read, and
+ * CHAR_NULL when the end of the file is reached first.
+ */
+char learner::obtainAtrChar(FILE *channel)
+{
+  char c;
+  while (!feof(channel))
+  {
+    c=fgetc(channel);
+    if (c=='w' || c=='p' || c=='k' || c=='m')
+    {
+      fgetc(channel);               // drop the character right after the letter
+      return (c=='m') ? 's' : c;    // 'm' is reported as 's'
+    }
+  }
+  return CHAR_NULL;
+}
+
+/**************************************************************/
+
+/*
+ * Reads one integer of an attribute's position list (the numbers that
+ * select which window positions to consider) from `channel`.
+ * Characters are consumed until '(', ',' or ')' is read or the end of
+ * the file is reached. Everything read except the parentheses is
+ * accumulated and converted with atoi, so a terminating ',' ends up in
+ * the buffer but is ignored by the conversion.
+ * *endAtr is set to 1 when the read stopped on ')'; it is left
+ * untouched otherwise.
+ * Returns the number read (0 when no digits were found).
+ */
+int learner::obtainAtrInt(FILE *channel,int *endAtr)
+{
+  char c=' ';
+  std::string num;
+
+  while ( (!feof(channel)) && (c!='(') && (c!=',') && (c!=')') )
+  {
+    c=fgetc(channel);
+    if ((c!='(') && (c!=')')) num += c;
+  }
+  if (c==')') *endAtr=1;
+  return atoi(num.c_str());
+}
+
+/***************************************************************/
+
+/*
+ * Counts the words and sentences of the corpus stored in file `name`.
+ * Each iteration reads a first token (delimited by ' ') and then the
+ * rest of the line (delimited by '\n') through readTo. The sentence
+ * counter grows when the first token is "!", "?" or "."; the word
+ * counter grows for every line read before the end of the file.
+ * The results are returned through nWords and nSentences.
+ * If the corpus holds one word or less, a message is printed and the
+ * program terminates with a failure exit status.
+ */
+void learner::learnerCount(const std::string& name, int *nWords, int *nSentences)
+{
+  int ret1=0,ret2=0,cont=0,contWords=0;
+  std::string str;
+
+  FILE *f = openFile(name.c_str(), "r");
+  while (!feof(f) && ret1>=0 && ret2>=0)
+  {
+    ret1 = readTo(f,' ',0,str);
+    if (str == "!" || str == "?" || str == ".") cont++;
+    ret2 = readTo(f,'\n',0,str);
+    if ( !feof(f) && ret1 >= 0 ) contWords++;
+  }
+  fclose(f);
+
+  if ( contWords <= 1 )
+  {
+    fprintf(stderr,"\n\nInput corpus too short to begin training!! Program stopped.\n\n");
+    // This is an error exit: report failure to the calling shell/script.
+    exit(EXIT_FAILURE);
+  }
+
+  *nWords = contWords;
+  *nSentences = cont;
+}
+
+
+/********************************************************/
+
+/*
+ * Prints, only in verbose mode, the banner that announces a training
+ * or feature-extraction phase on stderr.
+ *   numModel  model number shown in the banner.
+ *   K_or_U    KNOWN selects the "KNOWN WORDS" label, anything else
+ *             "UNKNOWN WORDS" (ignored when is_fex is TRUE).
+ *   LR_or_RL  LEFT_TO_RIGHT, RIGHT_TO_LEFT or LR_AND_RL; any other
+ *             value prints no direction label.
+ *   is_fex    FALSE prints the training title, any other value the
+ *             feature-extraction title.
+ */
+void learner::learnerPrintMessage(int numModel, int K_or_U, int LR_or_RL, int is_fex)
+{
+  if (verbose!=TRUE) return;
+
+  fprintf(stderr,"\n\n* ========================================================================");
+  if (is_fex==FALSE) fprintf(stderr,"\n* TRAINING MODEL %d ",numModel);
+  else fprintf(stderr,"\n* FEATURES EXTRACTION FOR MODEL %d ",numModel);
+  if (is_fex==TRUE) fprintf(stderr,"[ KNOWN AND UNKNOWN WORDS - ");
+  else if (K_or_U==KNOWN) fprintf(stderr,"[ KNOWN WORDS - ");
+  else fprintf(stderr,"[ UNKNOWN WORDS - ");
+  if (LR_or_RL==LEFT_TO_RIGHT) fprintf(stderr,"LEFT TO RIGHT ]");
+  else if (LR_or_RL==RIGHT_TO_LEFT) fprintf(stderr,"RIGHT TO LEFT ]");
+  else if (LR_or_RL==LR_AND_RL) fprintf(stderr,"LEFT TO RIGHT AND RIGHT TO LEFT ]");
+  fprintf(stderr,"\n* ========================================================================");
+}
+
+/**************************************************************/
+
+/*
+ * Parameters:
+ * wrd char * Word of the sample being processed.
+ * numModel int Model being trained.
+ * direction int Direction in which training is performed
+ (right to left or left to right).
+ * Known_or_Unknown int Whether training is for known
+ or unknown words.
+ * pos char * The part-of-speech tag being trained.
+ * samplePos char * Part-of-speech tag of the sample being processed.
+ * features char * List of attributes generated for the sample.
+ * d dictionary * Dictionary used for training.
+ * nNeg int * Pointer to the number of words selected as negative samples.
+ * nPos int * Pointer to the number of words selected as positive samples.
+ *
+ * This method can be used to select samples for known words.
+ */
+void learner::learnerPushSample(const std::string& wrd,int numModel,int direction, int Known_or_Unknown,const std::string& pos, const std::string& samplePos, const std::string& features,dictionary *d, int *nNeg, int *nPos)
+{
+  std::string fileName;
+  //Build the name of the file the sample must be appended to.
+  generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName);
+
+  FILE *f = openFile(fileName,"a+");
+
+  //Get the list of candidate part-of-speech tags for the word wrd.
+  simpleList* l = learnerGetPotser(wrd,Known_or_Unknown,d);
+  l->setFirst();
+  //NOTE(review): this loop ends as soon as next() returns true, while other
+  //loops in this file keep iterating while next() returns true -- confirm
+  //the return convention of next() and that every candidate is visited.
+  for (bool stop=false; !stop ; stop = l->next())
+  {
+    //Look for the tag pos among the candidates.
+    //When found, the sample is written as positive if pos equals
+    //samplePos and as negative otherwise.
+    infoDict *pInfo = *l->getIndex();
+    if (pInfo != NULL )
+    {
+      if (pInfo->pos == pos)
+      {
+        if (pInfo->pos == samplePos)
+        {
+          //Positive Sample
+          *nPos=(*nPos)+1;
+          fprintf (f,"+1 %s\n",features.c_str());
+        }
+        else
+        {
+          //Negative Sample
+          *nNeg=(*nNeg)+1;
+          fprintf (f,"-1 %s\n",features.c_str());
+        }
+      }
+    }
+  }
+
+  //The list returned by learnerGetPotser is released here: first its
+  //infoDict entries, then the list object itself.
+  d->dictCleanListInfoDict(l,l->numElements());
+  delete l;
+  //Close the file.
+  fclose(f);
+}
+
+/**************************************************************/
+/*
+ * Parameters:
+ * wrd Char * Word of the sample being processed.
+ * numModel Int Model being trained.
+ * direction Int Direction in which training is performed
+ (right to left or left to right).
+ * Known_or_Unknown Int Whether training is for known
+ or unknown words.
+ * pos Char * The part-of-speech tag being trained.
+ * samplePos Char * Part-of-speech tag of the sample being processed.
+ * features Char * Lista de atributos generadas para el ejemplo. + * d dictionary * Diccionario que se está usando para el entrenamiento. + * nNeg int * Apuntador al número de palabras seleccionadas como ejemplos negativos. + * nPos int * Apuntador al número de palabras seleccionadas como ejemplos positivos. + * Este método se encarga de seleccionar ejemplos para palabras desconocidas. +*/ +void learner::learnerPushSampleUnk(const std::string& /*wrd*/, int numModel, int direction, int Known_or_Unknown, const std::string& pos, const std::string& samplePos, const std::string& features, dictionary* /*d*/, int* nNeg, int* nPos) +{ + std::string fileName; + generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName); + //Abrimos el fichero + FILE *f = openFile(fileName,"a+"); + + //Si pos es igual a samplePos se selecciona el ejemplo como positivo, + //en cualquier otro caso se selecciona como negativo. + if (samplePos == pos) + { + //Positive Sample + *nPos=(*nPos)+1; + fprintf (f,"+1 %s\n",features.c_str()); + } + else + { + //Negative Sample + *nNeg=(*nNeg)+1; + fprintf (f,"-1 %s\n",features.c_str()); + } + //Cerramos el fichero + fclose(f); +} + +/**************************************************************/ +/* + * Este método recibe como parámetros un apuntador a un fichero (f) + * y el apuntador a un objeto del tipo mapping. El objetivo de este + * método es leer los atributos generados para un ejemplo del fichero + * f y devolver un string con la lista de atributos en el formato + * esperado por SVM-light. 
+ */ +std::string learner::learnerCreateFeatureString(FILE *f,mapping *m) +{ + std::string features; + int array[learnerNumFeatures]; + int ret1=1,i = 0; + std::string str; + + //Construimos un array de enteros con los + //identificadores numéricos de cada atributo + while (ret1>0 && !feof(f)) + { + ret1 = readTo(f,' ','\n',str); + int num = m->mappingGetNumberByFeature(str.c_str()); + if (ret1>=0 && num>-1) + { + array[i]=num; + i++; + } + } + + //qsort --> ordena ascendetemente un array de enteros + qsort(array,0,i-1); + + for (int j=0;j=0 && ret2>=0) + { + int isPossiblePOS = learnerIsPossiblePOS(wrd,pos,K_or_U); + if (isPossiblePOS==TRUE) + { + //Preparamos la lista de features + features = learnerCreateFeatureString(f,m); + + if (K_or_U==KNOWN) learnerPushSample(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos); + else learnerPushSampleUnk(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos); + + if ( verbose == TRUE) showProcessDone(cont , 1000, FALSE,"samples"); + } + else readTo(f,'\n','\n',garbage); + cont++; + } //if + } //While +} + +/**************************************************************/ + +/* + * Parámetros: + * f FILE* Apuntador al fichero de ejemplos + * numModel Int Modelo que estamos entrenando. + * LR_or_RL Int Dirección en la cual se realiza el entrenamiento + * (Derecha a izquierda o izquierda a derecha). + * K_or_U Int Si se está entrenando para palabras conocidas o desconocidas. + * d dictionary* Diccionario que se está usando para el entrenamiento. + * lPosToTrain simpleList * Lista de etiquetas morfosintácticas a entrenar + * + * Este método es el encargado de realizar el aprendizaje. Prepara las opciones de + * ejecución para SVM-light. Construye el mapping mediante los datos contenidos + * en el fichero de entrada. Crea un depósito de pesos (weightRepository) y + * un hashing en el que almacenar los sesgos obtenidos para cada etiqueta. 
+ * Y prepara los datos para que puedan ser procesados por SVM-light. + * Para cada etiqueta morfosintáctica de la lista lPosToTrain, se + * llama al método learnerDressNakedSetTrain para conseguir los ficheros + * de entrada para SVM-light. Una vez hecho esto, se llama a learnerExecSVMlight, + * para ejecutar SVM-light. Con el fichero de salida generado por la herramienta + * de Joachims, se rellenan las estructuras de datos con los pesos + * (learnerBuiltWeightRepository) y los sesgos (learnerBuiltBias) + * Una vez procesadas todas las etiquetas morfosintácticas a entrenar, el depósito + * de pesos y el hashing de sesgos contienen todos los datos del modelo y se escriben + * en disco mediante los métodos weightRepository.wrWriteHash para los sesgos y + * weightRepository.wrWrite para los pesos. + */ + +void learner::learnerDoLearn(FILE* f, int numModel, int LR_or_RL, int K_or_U, dictionary* d, simpleList< infoDict* >* lPosToTrain) +{ + std::string posFileName,svmFileName,mapFileName; + + //Preparamos las opciones con que se ejecutará svm-light + std::ostringstream options; + options << std::fixed << std::setprecision(6); + if (CK!=0 && K_or_U==KNOWN) options << " -c "<mappingBuilt(f,MAX_MAPPING_SIZE,COUNT_CUT_OFF); + generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MAP",mapFileName); + m->mappingWrite(mapFileName.c_str(),FALSE); + + //Creamos el depósito de pesos y el hash de sesgos + hash_t *b = new hash_t; + b->hash_init(30); + weightRepository *wr = new weightRepository; + + infoDict *pInfo; + int nPositive=0,nNegative=0; + + //Nos situamos en el primer elemento de lista + lPosToTrain->setFirst(); + //Para cada elemento de la lista de etiquetas + for (int ret=true; ret; ret=lPosToTrain->next()) + { + if ( verbose == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------"); + + //Obtenemos la etiqueta morfosintáctica + pInfo = *lPosToTrain->getIndex(); + nPositive=0; + nNegative=0; + + //Preparamos el entrenamiento + 
generateFileName(NAME,pInfo->pos,numModel,LR_or_RL,K_or_U,"POS",posFileName); + generateFileName(NAME,pInfo->pos,numModel,LR_or_RL,K_or_U,"SVM",svmFileName); + + //Seleccionamos los ejemplos para el entrenamiento para la POS que estamos viendo + if ( verbose == TRUE ) { fprintf(stderr,"\nPreparing training set for [ %s ] ..",pInfo->pos.c_str()); } + +learnerDressNakedSetTrain(d,m,f,pInfo->pos,numModel,LR_or_RL, K_or_U,&nPositive,&nNegative); + + //cout << "\nPUTOOOOOOO\n"; + + //exit(0); + + + //Realizamos el entrenamiento llamando a SVM-light + if ( verbose == TRUE ) fprintf(stderr,"\nTraining [ %s ] with %d samples: [+] = %d samples ; [-] = %d samples\n",pInfo->pos.c_str(),nPositive+nNegative,nPositive,nNegative); + learnerExecSVMlight(SVMDIR,options.str(),posFileName,svmFileName); + + //Se insertan los valores obtenidos del entrenamiento en el depósito de pesos + //y el el hashing de sesgos + if ( verbose == TRUE ) fprintf(stderr,"\nAdding elements to MERGED MODEL from [ %s ]",posFileName.c_str()); + wr = learnerBuiltWeightRepository(wr,m,pInfo->pos,svmFileName); + if ( verbose == TRUE ) fprintf(stderr," [DONE]"); + if ( verbose == TRUE ) fprintf(stderr,"\nAdding biases from [ %s ]",posFileName.c_str()); + b = learnerBuiltBias(b,pInfo->pos,svmFileName.c_str()); + if ( verbose == TRUE ) fprintf(stderr," [DONE]"); + + if (REMOVE_FILES == TRUE) + { + //char cmd[150]; + //sprintf(cmd,"rm -f %s",posFileName); + remove(posFileName.c_str()); //system(cmd); + //sprintf(cmd,"rm -f %s",svmFileName); + remove(svmFileName.c_str()); //system(cmd); + } + } + lPosToTrain->setFirst(); + + if ( verbose == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------"); + std::string fileName; + generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MRG",fileName); + if ( verbose == TRUE ) fprintf(stderr,"\nStoring MERGED MODEL [ %s ]",fileName.c_str()); + //Modificación 180705: Filtrado de pesos + if ( K_or_U == KNOWN ) wr->wrWrite(fileName, KFILTER); //ADD 
180705 + else wr->wrWrite(fileName, UFILTER); //ADD 180705 + //Escribir deposito de pesos en disco + //wr->wrWrite(fileName); //DEL 180705 + if ( verbose == TRUE ) fprintf(stderr," [DONE]"); + + FILE *fwr = openFile(fileName,"a+"); + fprintf (fwr,"BIASES "); + wr->wrWriteHash(b,fwr,' '); //Escribir biases en fichero de depósito de pesos + fclose(fwr); + + generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"B",fileName); + if ( verbose == TRUE ) fprintf(stderr,"\nStoring BIASES [ %s ]",fileName.c_str()); + FILE *fb =openFile(fileName,"w"); + wr->wrWriteHash(b,fb,'\n'); //Escribir biases en fichero de sesgos + fclose(fb); + + if ( verbose == TRUE ) fprintf(stderr," [DONE]"); + + delete m; + delete wr; + learnerDestroyBias(b); +} + +/*******************************************************/ +/* + * Recibe como parámetro el nombre del fichero de entrenamiento + * (trainingFileName), el diccionario para palabras conocidas (dKnown), + * el número del modelo a entrenar (numModel), la dirección en la cual + * se realiza el entrenamiento (direction), el número de frases del + * corpus (numSent), el número de palabras del corpus (numWords) y + * el número de fragmentos en que se ha de dividir el corpus para + * entrenar palabras desconocidas (numChunks). 
+ * + * Este método selecciona ejemplo y realiza el entrenamiento del modelo + */ +void learner::learnerTrainModel(const std::string& trainingFileName, dictionary *dKnown,int numModel, int direction, int /*numSent*/,int numWords, int numChunks) +{ + FILE *fKnownRL,*fUnknownRL,*fUnknownLR,*fKnownLR; + int contSentences = 0, ret = 1; + std::ostringstream name; + + //Carga las listas de atributos + simpleList featureList,featureListUnk; + name << std::string(NAME) << std::string(".A") << numModel; + createFeatureList(name.str(),&featureList); + name.rdbuf()->str(""); + name << NAME << ".A"<str(""); + name << std::string(NAME) << ".M" << numModel << ".RL.SAMPLES"; + fKnownRL=openFile(name.str().c_str(),"w+"); + name.rdbuf()->str(""); + name << std::string(NAME) << ".UNK.M" << numModel << ".RL.SAMPLES"; + fUnknownRL=openFile(name.str().c_str(),"w+"); + } + if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) + { + name.rdbuf()->str(""); + name << std::string(NAME)<<".M"<str(""); + name << std::string(NAME) << ".UNK.M"<=0 && idictRepairFromFile(R); + else dUnknown->dictRepairHeuristic(DRATIO); + name.rdbuf()->str(""); + name << std::string(NAME) << ".DICT."<dictWrite(name.str()); + delete dUnknown; + dUnknown = new dictionary(name.str()); + + //Si el modelo en 4 usaremos el mismo diccionario para conocidas y desconocidas + dictionary *d_for_known = NULL; + //if (numModel==4) dKnown = dUnknown; + if ( numModel == 4 ) { d_for_known = dUnknown; } + else { d_for_known = dKnown; } + + if ( verbose == TRUE ) fprintf(stderr,"\nExtracting features : "); + + nWordsLR = chunkSize; + nWordsRL = chunkSize; + + //Mientras haya palabras por leer y no estemos al final del chunk + while ( ret>=0 && is_end_of_chunk==FALSE ) + { + is_end_of_sentence = FALSE; + //Si es LR o LRL + if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) + { + fKnown = fKnownLR; fUnknown = fUnknownLR; + //Recorremos el texto en sentido LR para seleccionar ejemplos + nWordsLR = 
learnerLeftToRight(&featureList,&featureListUnk, d_for_known, dUnknown, nWordsLR, inicioLR); + inicioLR = sw->getIndex()->ord; + if (!sw->next()) + { + is_end_of_sentence = TRUE; + inicioLR = -1; + } + } + //Si es RL o LRL + if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) + { + fKnown = fKnownRL; fUnknown = fUnknownRL; + //Recorremos el texto en sentido RL para seleccionar ejemplos + nWordsRL = learnerRightToLeft(&featureList,&featureListUnk, d_for_known , dUnknown, nWordsRL, inicioRL); + inicioRL = sw->getIndex()->ord; + if ( !sw->previous() ) + { + is_end_of_sentence = TRUE; + inicioRL = -1; + } + } + + contSentences++; + + if ( verbose == TRUE) showProcess(contSentences,0); + //Si es fin de frase recargamos la ventana + if ( is_end_of_sentence == TRUE ) + { + sw->deleteList(); + ret = sw->iniGeneric(dKnown); + } + + //Si hemos recorrido todas la palabras del chunk es fin de chunk + if ( nWordsRL <= 0 || nWordsLR <= 0 ) + { + is_end_of_chunk = TRUE; + } + } + + if ( verbose == TRUE) showProcess(contSentences,1); + delete dUnknown; + + //Si es necesario borramos los ficheros temporales + //char cmd[200]; + //sprintf(cmd, "rm -f %s",name); + if (REMOVE_FILES == TRUE) remove(name.str().c_str());//system(cmd); + } + + //Creamos una copia de la lista de etiquetas para palabras conocidas + simpleList* copyOfUNKP_L = learnerTransformHashInList(learnerUNKP_H); + //para cada tipo known o unk + //para cada direccion + if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL) + { + //Realizamos el entrenamiento + learnerDoLearn(fKnownLR,numModel,LEFT_TO_RIGHT,KNOWN,dKnown,learnerAMBP_L); + learnerDoLearn(fUnknownLR,numModel,LEFT_TO_RIGHT,UNKNOWN,dKnown,copyOfUNKP_L); + fclose (fKnownLR); + fclose (fUnknownLR); + } + + if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL) + { + //Realizamos el entrenamiento + learnerDoLearn(fKnownRL,numModel,RIGHT_TO_LEFT,KNOWN,dKnown,learnerAMBP_L); + learnerDoLearn(fUnknownRL,numModel,RIGHT_TO_LEFT,UNKNOWN,dKnown,copyOfUNKP_L); + 
fclose(fKnownRL); + fclose(fUnknownRL); + } + //fin para cada direccion + //fin para cada tipo + + destroyFeatureList(&featureList); + destroyFeatureList(&featureListUnk); +} + +/**********************************************************/ + +/* + * Recibe como parámetros: wrd un string con una palabra, + * pos un string con una etiqueta, Known_or_Unknown si la + * palabra es conocida o desconocida. Si la palabra es conocida + * devuelve TRUE si la etiqueta pos está en la lista de etiquetas + * morfosintácticas ambiguas. Si la palabra es desconocida + * devuelve TRUE si la etiqueta pos pertenece a lista de posibles + * categorías morfosintácticas para palabras desconocidas. En cualquier + * otro caso devuelve FALSE. + */ +int learner::learnerIsPossiblePOS(const std::string& /*wrd*/, const std::string& pos, int Known_or_Unknown) +{ + if (Known_or_Unknown==KNOWN) + { + if (HASH_FAIL != (long)learnerAMBP_H->hash_lookup(pos)) + { + return TRUE; + } + } + else if (Known_or_Unknown==UNKNOWN) + { + if (HASH_FAIL != (long)learnerUNKP_H->hash_lookup(pos)) + { + return TRUE; + } + } + return FALSE; +} + +/**************************************************************/ +/* + * Recibe como parámetros: + * wrd un string con una palabra, + * Known_or_Unknown un entero que indica si la palabra + * es conocida o desconocida y + * un apuntador d al diccionario. Devuelve un apuntador a una lista. + */ +simpleList* learner::learnerGetPotser(const std::string& wrd, int Known_or_Unknown, dictionary* d) +{ + bool stop=false,ret=true; + dataDict* w = d->getElement(wrd); + infoDict *pInfoDict; + simpleList* lout = new simpleList(); + + //Si es conocida recoge las posibles etiquetas que contiene el diccionario + //para la palabra wrd y mira si existen en la lista de categorías + //morfosintácticas ambiguas. 
Si existen en la lista de etiquetas ambiguas + //las añade a la lista de salida, en caso contrario si no esta es posible + //etiqueta ambigua y es la etiqueta más frecuente se devuelve una lista + //sólo con la etiqueta más frecuente. + if (Known_or_Unknown==KNOWN) + { + if ((long)w != HASH_FAIL) + { + simpleList* list = &d->getElementMaybe(w); + list->setFirst(); + while (ret && !stop ) + { + pInfoDict = *list->getIndex(); + + if (HASH_FAIL!=(long)learnerAMBP_H->hash_lookup(pInfoDict->pos)) + { + infoDict *ptr = new infoDict; + ptr->pos = pInfoDict->pos; + ptr->num = pInfoDict->num; + lout->add(ptr); + } + else if ( d->getMFT(w)->pos == pInfoDict->pos ) //si es most frequent tag + { + d->dictCleanListInfoDict(lout,lout->numElements()); + lout->deleteList(); + infoDict *ptr = new infoDict; + ptr->pos = pInfoDict->pos; + ptr->num = pInfoDict->num; + lout->add(ptr); + stop = true; + } + ret=list->next(); + } + list->setFirst(); + } + } + //Si la palabra es desconocida la lista de salida contiene todas las + //categorías morfosintácticas posibles desconocidas. + else if (Known_or_Unknown==UNKNOWN) + { + learnerUNKP_L->setFirst(); + while (ret>=0) + { + pInfoDict =*learnerUNKP_L->getIndex(); + infoDict *ptr = new infoDict; + ptr->pos = pInfoDict->pos; + ptr->num = pInfoDict->num; + lout->add(ptr); + + ret=learnerUNKP_L->next(); + } + learnerUNKP_L->setFirst(); + } + + return lout; +} + + +/**********************************************************/ +/* + * Recibe como parámetros: + * un apuntador al depósito de pesos (wr), + * un apuntador a mapping (m), + * un string con la etiqueta (pos) que estamos entrenando y + * el nombre del fichero (fileName) en el cual se almacenan las SVs + * para la etiqueta indicada. + * + * Partiendo de estos datos recorre el fichero fileName, traduciendo + * cada uno de los atributos leídos gracias al mapping (m). + * Y añadiendo la correspondiente pareja etiqueta/atributo + * al depósito de pesos wr mediante el método wrAdd. 
+ *
+ * Returns the pointer to the weight repository (weightRepository)
+ * that has been modified.
+ */
+weightRepository *learner::learnerBuiltWeightRepository(weightRepository* wr, mapping* m, const std::string& pos, const std::string& fileName)
+{
+  std::string str;
+  std::string key;
+  FILE *f = openFile(fileName,"r");
+  int ret=1,trobat=FALSE;
+
+  while (!feof(f) && ret>=0)
+  {
+    if (trobat==FALSE)
+    {
+      //Skip lines until the one containing "threshold" has been read.
+      ret= readTo(f,'\n',0,str);
+      if (str.find("threshold")!=std::string::npos) trobat=TRUE;
+    }
+    else
+    {
+      //First token of the line (up to ' '): the weight applied to every
+      //attribute listed on the rest of the line.
+      ret = readTo(f,' ',0,str);
+      long double ld;
+
+      ld = atof(str.c_str());
+
+      while (ret>0)
+      {
+        //Token up to ':' is the attribute number; it is translated back
+        //to the attribute through the mapping.
+        ret = readTo(f,':','\n',str);
+        if (ret>0)
+        {
+          key = m->mappingGetFeatureByNumber(str.c_str());
+          //The token after ':' is read but its contents are not used.
+          ret = readTo(f,' ','\n',str);
+          //if ((int)key != HASH_FAIL) wr->wrAdd(key,pos,ld);
+          //NOTE(review): key is a string converted with atoi before being
+          //compared with HASH_FAIL -- confirm this really detects a failed
+          //mapping lookup.
+          if (atoi(key.c_str()) != HASH_FAIL) wr->wrAdd(key,pos,ld);
+        }
+      }
+    }
+  }
+  fclose (f);
+  return wr;
+}
+
+/**************************************************************/
+
+/*
+ * Releases the bias hash by calling its hash_destroy method.
+ */
+void learner::learnerDestroyBias(hash_t *h)
+{
+  h->hash_destroy();
+}
+
+/**************************************************************/
+
+/*
+ * Parameters:
+ * a pointer to hash_t (h), which is where the biases are stored,
+ * the tag (pos) being trained and the SVM-light output file (fileName)
+ * in which the expected data are found.
+ * This method reads the bias from the file and adds it (hash_insert)
+ * to the hash h for the given tag pos.
+ * It then returns the pointer to the modified hash.
+ */ +hash_t *learner::learnerBuiltBias(hash_t* h, const std::string& pos, const std::string& fileName) +{ + std::string str; + FILE *f = openFile(fileName.c_str() ,"r"); + int ret=1,trobat=FALSE; + + while (!feof(f) && trobat==FALSE && ret>=0) + { + ret= readTo(f,'\n',0,str); + if (str.find("threshold") != std::string::npos) trobat=TRUE; + } + + std::string bias; + bool sortir = false; + for (std::string::size_type i=0;(ipos = pos; + w->data = (long double)0; + w->data = atof (bias.c_str()); + h->hash_insert(w->pos, w); + + fclose(f); + return h; +} + +/********************************************************/ + +/* + * Recibe como parámetro el nombre del fichero de configuración (train). + * Ejecuta el aprendizaje + */ +void learner::learnerRun(const std::string& train) +{ + int iWrd=0,iSent=0; + + //Leemos el fichero de configuración + read_config_file(train); + //obtener tamaño del corpus + learnerCount(TRAINSET,&iWrd,&iSent); + + if ( verbose == TRUE ) + { + fprintf(stderr,"\n* trainset # words = [ %d ]",iWrd); + fprintf(stderr,"\n* trainset # sentences = [ %d ]",iSent); + fprintf(stderr,"\n* ========================================================================"); + fprintf(stderr,"\n\n* ========================================================================"); + fprintf(stderr,"\n* PREPARING TRAINING"); + fprintf(stderr,"\n* ========================================================================"); + } + struct tms tbuff1,tbuff2; + clock_t start,end; + start = times(&tbuff1); + + //Creamos el diccionario + std::string name; + name = std::string(NAME) + ".DICT"; + dictionary *d = new dictionary(TRAINSET, 0,0); + if (R!=NULL) d->dictRepairFromFile(R); // "/mnt/hda4/pfc/WSJ.200"); + else d->dictRepairHeuristic(DRATIO); + d->dictWrite(name); + delete d; + d = new dictionary(name); + + //Obtenemos las listas de etiquetas + if (learnerAMBP_H == NULL) + learnerAMBP_H = d->dictFindAmbP(&learnerNumAMBP); + if (learnerUNKP_H == NULL) + learnerUNKP_H = 
d->dictFindUnkP(&learnerNumUNKP); + //Creamos los ficheros de etiquetas + learnerCreatePOSFile(NAME,TRUE,learnerAMBP_H); + learnerCreatePOSFile(NAME,FALSE,learnerUNKP_H); + //Creamos unos hashings con las listas de etiquetas + learnerAMBP_L = learnerTransformHashInList(learnerAMBP_H); + learnerUNKP_L = learnerTransformHashInList(learnerUNKP_H); + + //creamos el fichero de configuración de la ventana + learnerCreateDefaultFile(NAME,"WIN"); + + //Calculamos en numero de chunks + int chunks = learnerNumChunks(TRAINSET,X,iSent); + + //Mientras haya modelos por entrenar + while (!DO.empty()) + { + int *numModel = DO.top(); + DO.pop(); + int *direction = DO.top(); + DO.pop(); + + //Eliminamos los ficheros anteriores + removeFiles(NAME, RM_MODEL_FILES,*numModel, *direction, verbose); + removeFiles(NAME, RM_TEMP_FILES ,*numModel, *direction, verbose); + + std::ostringstream name; + name << std::string("A") << *numModel; + learnerCreateDefaultFile(NAME,name.str()); + + //Creamos la ventana + std::ifstream f(TRAINSET); + sw = new swindow(f,WINDOW_SIZE,CORE_POSITION, &std::cerr, d); + + //ejecutamos el entrenamiento para el modelo + learnerTrainModel(TRAINSET,d,*numModel,*direction,iSent,iWrd,chunks); + delete sw; + } + delete d; + + end = times(&tbuff2); + if ( verbose == TRUE ) + { + fprintf(stderr,"\n\n* ========================================================================\n"); + showTime ("* SVM-light Time", time_svmlight, time_svmlight, 0); + showTime ("* SVMTlearn Time", + ((double)(end-start))/CLOCKS_PER_SECOND - time_svmlight, //CLK_TCK, + ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, + ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); + showTime ("* Total Learning Time", + time_svmlight + ((double)(end-start))/CLOCKS_PER_SECOND, + time_svmlight + ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, + ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); + fprintf(stderr,"* 
========================================================================\n\n"); + } +} + +/**************************************************************/ + +/* + * Recorre el texto de izquierda a derecha seleccionando ejemplos + */ +int learner::learnerLeftToRight(simpleList* featureList, simpleList* featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio) +{ + std::cerr << "learner::learnerLeftToRight" << std::endl; + bool ret = true; + + if ( inicio == -1 ) while (sw->previous()); + else if (sw->getIndex()->ord!=inicio) + { + while(ret && sw->getIndex()->ord != inicio) + { + if (inicio < sw->getIndex()->ord ) ret = sw->previous(); + if (inicio > sw->getIndex()->ord ) ret = sw->next(); + } + } + nodo *elem = sw->getIndex(); + numWrds--; + learnerGenerateFeatures(elem,featureList,dKnown, LEFT_TO_RIGHT); + learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT); + + while(numWrds>=0) + { + if ( !sw->next() ) return numWrds; + + elem = sw->getIndex(); + numWrds--; + learnerGenerateFeatures(elem,featureList,dKnown,LEFT_TO_RIGHT); + learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT); + } + + return (numWrds); +} + +/**************************************************************/ + +/* + * Recorre el texto de derecha a izquierda seleccionando ejemplos + */ +int learner::learnerRightToLeft(simpleList* featureList, simpleList* featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio) +{ + std::cerr << "learner::learnerRightToLeft" << std::endl; + bool ret = true; + + if ( inicio == -1 ) while (sw->next()); + else if ( sw->getIndex()->ord != inicio ) + { + while(ret && sw->getIndex()->ord != inicio) + { + if (inicio < sw->getIndex()->ord ) ret = sw->previous(); + if (inicio > sw->getIndex()->ord ) ret = sw->next(); + } + } + + nodo *elem = sw->getIndex(); + numWrds--; + learnerGenerateFeatures(elem,featureList,dKnown,RIGHT_TO_LEFT); + 
learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown,dUnknown,RIGHT_TO_LEFT); + + while( numWrds>=0 ) + { + if ( !sw->previous() ) return numWrds; + + elem = sw->getIndex(); + numWrds--; + learnerGenerateFeatures(elem,featureList,dKnown, RIGHT_TO_LEFT); + learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, RIGHT_TO_LEFT); + } + + return numWrds; +} + +/**************************************************************/ +/* + * Esta función recibe como parámetros: + * el apuntador a un nodo de la ventana (elem), + * una pila donde apilara los atributos generados (stk), + * la lista de atributos que debe generar (featureList), + * el diccionario con la información necesaria para el cálculo de features (d) + * y la dirección en que se recorre el corpus (direction). + * Recorre la lista featureList y ejecuta los métodos necesarios + * de la ventana (swindow) para generar los atributos y que al final + * de la ejecución de este método esten apilados en stk. +*/ +void learner::learnerGetFeatures(nodo* elem, std::stack& stk, dictionary* d, simpleList* featureList, int direction) +{ + nodo_feature_list* aux = NULL; + bool ret = true; + //Recorre la lista de atributos y crea los atributos correspondientes + while (ret) + { + aux = *featureList->getIndex(); + if (aux->mark == SLASTW) sw->winPushSwnFeature(stk); + else if (aux->mark == WMARK) sw->winPushWordFeature((void *)aux,d,stk,direction); + else if (aux->mark == KMARK) sw->winPushAmbiguityFeature((void *)aux,d,stk,direction); + else if (aux->mark == MMARK) sw->winPushMaybeFeature((void *)aux,d,stk,direction); + else if (aux->mark == PMARK) sw->winPushPosFeature((void *)aux,d,stk,direction); + else if (aux->mark == MFTMARK) sw->winPushMFTFeature((void *)aux,d,stk,direction); + else + { + int *param; + if (!aux->l.isEmpty()) + { + param = (int *) aux->l.getIndex(); + } + if (aux->mark == PREFIX_MARK) sw->winPushPrefixFeature(elem->wrd, stk, *param); + else if (aux->mark == SUFFIX_MARK) 
sw->winPushSuffixFeature(elem->wrd, stk, *param); + else if (aux->mark == CHAR_A_MARK) sw->winPushLetterFeature(elem->wrd, stk, COUNTING_FROM_BEGIN, *param); + else if (aux->mark == CHAR_Z_MARK) sw->winPushLetterFeature(elem->wrd, stk, COUNTING_FROM_END, *param); + else if (aux->mark == LENGTH_MARK) sw->winPushLenghtFeature(elem->wrd,stk); + else if (aux->mark == START_CAPITAL_MARK) sw->winPushStartWithCapFeature(elem->wrd,stk); + else if (aux->mark == START_LOWER_MARK) sw->winPushStartWithLowerFeature(elem->wrd,stk); + else if (aux->mark == START_NUMBER_MARK) sw->winPushStartWithNumberFeature(elem->wrd,stk); + else if (aux->mark == ALL_UPPER_MARK) sw->winPushAllUpFeature(elem->wrd,stk); + else if (aux->mark == ALL_LOWER_MARK) sw->winPushAllLowFeature(elem->wrd,stk); + else if (aux->mark == CONTAIN_CAP_MARK) sw->winPushContainCapFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_CAPS_MARK) sw->winPushContainCapsFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_COMMA_MARK) sw->winPushContainCommaFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_NUMBER_MARK) sw->winPushContainNumFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_PERIOD_MARK) sw->winPushContainPeriodFeature(elem->wrd, stk); + else if (aux->mark == MULTIWORD_MARK) sw->winPushMultiwordFeature(elem->wrd, stk); + } + ret = featureList->next(); + } + featureList->setFirst(); +} + +/**************************************************************/ + +/* + * El objetivo de este m�todo es seleccionar o descartar una palabra para + * realizar en el entrenamiento de palabras desconocidas, calcular respectivos + * atributos e insertar esta informaci�n en el fichero de ejemplos correspondiente. 
+ */ +void learner::learnerGenerateFeaturesUnk(nodo *elem, simpleList* featureList,dictionary *d, dictionary *dUnk, int direction) +{ + std::stack stk; +// nodo_feature_list *aux; + int is_selected = FALSE; + std::string feature; + std::string buffer; + + if (d==NULL || elem==NULL || featureList==NULL) return; + +// init_stack(&stk); + + buffer = elem->wrd + ":" + elem->comment; + + dataDict* i = dUnk->getElement(elem->wrd); + if ((long)i!=HASH_FAIL) + { + /* + int i2 = d->getElement(elem->wrd); + if (dUnk->getElementNumMaybe(i) == 1 && hash_lookup(learnerUNKP_H,d->getMFT(i2))!=HASH_FAIL ) + { + fprintf(fUnknown,buffer,strlen(buffer)); + is_selected = TRUE; + } + */ + } + else + { + fprintf(fUnknown,buffer.c_str(),buffer.size()); + is_selected = TRUE; + } + + if ( is_selected == TRUE) learnerGetFeatures(elem, stk,dUnk, featureList, direction ); + + while (!stk.empty() && is_selected == TRUE) + { + feature = stk.top(); + stk.pop(); + buffer = std::string(" ") + feature; + fprintf(fUnknown,buffer.c_str(),buffer.size()); + } + + if (is_selected == TRUE) fprintf(fUnknown,"\n"); + + elem->pos = elem->comment; + } + +/**************************************************************/ + +/* + * El objetivo de este m�todo es seleccionar o descartar una palabra para + * realizar en el entrenamiento de palabras conocidas, calcular respectivos + * atributos e insertar esta informaci�n en el fichero de ejemplos correspondiente. 
+ */
+/*
+ * English summary: decides whether elem becomes a training example for the
+ * known-word model and, if so, appends one line to fKnown:
+ *
+ *   <wrd>:<comment> <feature> <feature> ...\n
+ *
+ * The word is selected when d has an entry for it, getElementNumMaybe()
+ * reports more than one candidate for that entry, and the pos of the entry
+ * returned by getMFT() is found in learnerAMBP_H.
+ *
+ * elem         word node; its pos field is overwritten with its comment field
+ * featureList  feature definitions, forwarded to learnerGetFeatures
+ * d            dictionary used for the lookup and for feature extraction
+ * direction    forwarded unchanged to learnerGetFeatures
+ *
+ * Returns without doing anything when any pointer argument is NULL.
+ */
+void learner::learnerGenerateFeatures(nodo *elem, simpleList* featureList,dictionary *d, int direction)
+{
+  if (d==NULL || elem==NULL || featureList==NULL) return;
+
+  bool is_selected = false;
+  dataDict* entry = d->getElement(elem->wrd.c_str());
+  if ((long)entry != HASH_FAIL)
+  {
+    is_selected = d->getElementNumMaybe(entry) > 1
+      && (long)(learnerAMBP_H->hash_lookup(d->getMFT(entry)->pos)) != HASH_FAIL;
+  }
+
+  if (is_selected)
+  {
+    // Each piece is written exactly once. The previous code reused an
+    // ostringstream and called clear(), which only resets the stream state
+    // flags and keeps the text, so every write repeated the whole line so far.
+    // fputs/fputc are used so that corpus text is never a printf format.
+    const std::string header = elem->wrd + ":" + elem->comment;
+    fputs(header.c_str(), fKnown);
+
+    // NOTE(review): the element type was missing in this copy of the patch;
+    // std::string is used because the popped values were assigned to a
+    // std::string. Confirm against learnerGetFeatures' declaration.
+    std::stack<std::string> stk;
+    learnerGetFeatures(elem, stk, d, featureList, direction);
+
+    while (!stk.empty())
+    {
+      fputc(' ', fKnown);
+      fputs(stk.top().c_str(), fKnown);
+      stk.pop();
+    }
+    fputc('\n', fKnown);
+  }
+
+  elem->pos = elem->comment;
+}
+
+/************************************************************/
+/*
+ * Runs SVM-light. It receives four strings as parameters: svmdir is
+ * the directory where SVM-light is located, options are the options
+ * SVM-light is launched with, posFile is the name of the examples
+ * file used as input for Joachims' tool and, finally, outFile is the
+ * name of the output file. This function returns 0.
+*/
+int learner::learnerExecSVMlight(const std::string& svmdir, const std::string& options, const std::string& posFile, const std::string& outFile)
+{
+  const time_t begin = time (0);
+
+  // The command line is assembled by plain concatenation and handed to the
+  // shell by system().
+  // NOTE(review): none of the four strings is quoted, so paths containing
+  // spaces or shell metacharacters will not survive; confirm the callers only
+  // pass trusted configuration values.
+  const std::string command = svmdir + "/svm_learn -v 0 "+options+" "+posFile+" "+outFile;
+
+  if ( verbose == TRUE ) fprintf(stderr,"Executing Joachims svm_light [ with options: %s ] ",options.c_str());
+  const int status = system(command.c_str());
+  if ( verbose == TRUE ) fprintf(stderr," [DONE]");
+
+  // The return value stays 0 for existing callers, but a failed run is no
+  // longer silent.
+  if ( status != 0 )
+    fprintf(stderr,"\nWARNING: svm_learn returned status %d for command: %s\n",status,command.c_str());
+
+  // Accumulate the wall-clock time spent inside svm_learn.
+  const time_t finish = time(0);
+  time_svmlight = difftime(finish,begin) + time_svmlight;
+
+  return 0;
+}
+
+/**************************************************************/
+
+/*
+ * Copies every value stored in the hash tptr into a newly allocated list
+ * and rewinds the list cursor to its first element before returning it.
+ * The list is created with new and returned to the caller; the infoDict
+ * objects are shared with the hash, not copied.
+ * NOTE(review): the template arguments of hash_t / simpleList appear to
+ * have been lost in this copy of the patch; they are reproduced as found.
+ */
+simpleList* learner::learnerTransformHashInList(hash_t *tptr)
+{
+  simpleList* l = new simpleList();
+
+  for (hash_t::iterator it = tptr->begin(); it != tptr->end(); ++it)
+  {
+    infoDict *p = ((*it).second);
+    l->add(p);
+  }
+
+  l->setFirst();
+  return l;
+}
+
+/**************************************************************/
+
+/*
+ * Computes the number of chunks the corpus has to be split into in
+ * order to obtain a given percentage of unknown words. The input
+ * parameters are the name of the training file (trainingFileName),
+ * the desired percentage of unknown words (percentage) and the
+ * number of sentences of the corpus (nSentences). The returned value
+ * is an integer giving the number of chunks. If the computed number
+ * of chunks is greater than the number of sentences in the training
+ * corpus, the number of sentences (nSentences) is returned
+ * instead.
+ */
+int learner::learnerNumChunks(const std::string& trainingFileName,float /*percentage*/,int nSentences)
+{
+  // The percentage parameter is unnamed and unused; the threshold below is
+  // derived from X.
+  // NOTE(review): X is not declared in this function; presumably it is the
+  // target unknown-word percentage defined elsewhere - confirm.
+  // NOTE(review): the template arguments of hash_t appear to have been lost
+  // in this copy of the patch; the declarations are reproduced as found.
+  int ret=0,ndwords=0,nwords=0;
+  std::string wrd;
+
+  FILE *f = openFile (trainingFileName.c_str(),"r");
+
+  // First pass: count every word (nwords) and every insertion that reports
+  // HASH_FAIL (ndwords) - presumably first occurrences, TODO confirm against
+  // hash_insert. The rest of each line after the word is skipped.
+  hash_t h1;
+  h1.hash_init(10000);
+  while (!feof(f))
+  {
+    ret = readTo(f,' ','\n',wrd);
+    if (ret>=0)
+    {
+      nwords++;
+      std::string w(wrd);
+      if ((int)h1.hash_insert(w,atoi(w.c_str()))==HASH_FAIL) ndwords++;
+      if (ret>0) readTo(f,'\n','\n',wrd);
+    }
+  }
+
+  // Second pass: read again until ndwords * (100 - X) / 100 such insertions
+  // have been seen.
+  float meeting = ndwords * (100 - X ) /100;
+  int nwords2=0,ndwords2=0;
+  fseek(f,0,SEEK_SET);
+  hash_t h2;
+  h2.hash_init(10000);
+  // NOTE(review): this loop header was truncated in this copy of the patch
+  // ("ndwords2=0)"); it is rebuilt to mirror the first loop - verify against
+  // the upstream source.
+  while (!feof(f) && ndwords2<meeting)
+  {
+    ret = readTo(f,' ','\n',wrd);
+    if (ret>=0)
+    {
+      nwords2++;
+      std::string w(wrd);
+      if (h2.hash_insert(w,atoi(w.c_str()))==HASH_FAIL) ndwords2++;
+      if (ret>0) readTo(f,'\n','\n',wrd);
+    }
+  }
+
+  // When the second pass consumed as many words as the first one (including
+  // the empty-file case) the divisor is zero; fall back to the documented
+  // upper bound instead of trapping on an integer division by zero.
+  const int remaining = nwords - nwords2;
+  int chunks = (remaining > 0) ? nwords/remaining : nSentences;
+  if (nSentences<=chunks) chunks = nSentences;
+
+  fclose (f);
+
+  h2.hash_destroy();
+  h1.hash_destroy();
+  return chunks;
+}
+
+/**************************************************************/
+
+/*
+ * Reports whether the list holds an infoDict whose pos equals key.
+ * The scan starts at the list's current cursor position and the cursor is
+ * rewound with setFirst() before returning, whether or not key was found.
+ * Returns false for a NULL list or an empty key.
+ */
+bool learner::learnerIsInsideList(simpleList* l, const std::string& key)
+{
+  if (l==NULL || key.empty()) return false;
+
+  bool found = false;
+  int ret = 0;
+  while (ret>=0 && !found)
+  {
+    infoDict* ptr = (infoDict *)l->getIndex();
+    if (ptr!=NULL && key == ptr->pos) found = true;
+    // Advance the cursor; next() is used the same way by the feature loop in
+    // learnerGetFeatures. Without this step the loop never terminated unless
+    // the element under the cursor matched.
+    else ret = l->next();
+  }
+
+  l->setFirst();
+  return found;
+}
diff --git a/src/list.cc b/src/list.cc deleted file mode 100644 index 94ed080..0000000 --- a/src/list.cc +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later
version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include "list.h" -#include "common.h" - -/****************************************************************************/ -/* Simple List */ -/****************************************************************************/ -void simpleList::deleteList() -{ - int cont = numObj; - - if (first==NULL) return; - listNode *aux=first; - - while (first->next!=NULL && cont >= 1) - { - aux = first; - first = first->next; - cont = cont - 1; - delete aux; - } - - delete last; - numObj = 0; - first = NULL; - last = NULL; - index = NULL; -} - - -/****************************************************************************/ - -simpleList::~simpleList() -{ - deleteList(); -} - - -/****************************************************************************/ - -simpleList::simpleList() -{ - numObj = 0; - first = NULL; - last = NULL; - index = NULL; -} - - -/****************************************************************************/ - -/*Move Interest Point to next element */ -int simpleList::next() -{ - if ((index == NULL) || (index->next == NULL)) return -1; - index = index->next; - return 0; -} - - -/****************************************************************************/ - -/* Move Interest Point to previous element */ -int simpleList::previous() -{ - if ((index==NULL) || (index->previous==NULL)) return -1; - index = index->previous; - return 0; -} - - -/****************************************************************************/ - -/* Get Interest Point */ -void 
*simpleList::getIndex() -{ - if ( index == NULL ) return NULL; - else return index->data; -} - - -/****************************************************************************/ - -/* Get Interest Point */ -void *simpleList::getFirst() -{ - return first->data; -} - - -/****************************************************************************/ - -void *simpleList::getLast() -{ - return last->data; -} - - -/****************************************************************************/ - -void simpleList::setFirst() -{ - index = first; -} - - -/****************************************************************************/ - -void *simpleList::get(int position) -{ - listNode *aux; - int i; - - if (numObj == 0 || position >= numObj) - return NULL; - - aux = first; - - for(i=0; inext != NULL) aux = aux->next; - else return NULL; - } - return aux->data; -} - - -/****************************************************************************/ - -/* Show list elements */ -int simpleList::show() -{ - if (first==NULL) return 0; - - listNode *actual=first; - - while (actual->next!=NULL) - { - actual=actual->next; - - } - return 0; -} - - -/****************************************************************************/ - -int simpleList::add(void *object) -{ - listNode *aux = new listNode; - - if(numObj == 0) - { - aux->previous=NULL; - first = aux; - last = aux; - index = aux; - } - else - { - aux->previous = last; - last->next = aux; - last = aux; - } - - aux->ord = numObj; - aux->data = object; - aux->next=NULL; - numObj++; - return numObj; -} - - -/****************************************************************************/ - -int simpleList::delIndex() -{ - listNode *aux = index; - - if(numObj == 0) return -1; - - if (index==last && index==first) - { - first = aux->next; - aux->previous = NULL; - index = first; - last = aux->previous; - last->next = NULL; - index = last; - } - else if (index==first) - { - - first = aux->next; - first->previous = NULL; - index = first; - } - 
else if (index==last) - { - last = aux->previous; - last->next = NULL; - index = last; - } - else - { - aux->previous->next = aux->next; - aux->next->previous = aux->previous; - } - - numObj--; - delete aux; - return numObj; -} - - -/****************************************************************************/ - -int simpleList::isEmpty() -{ - if (numObj == 0 || first == NULL) return TRUE; - else return FALSE; - -} - - -/****************************************************************************/ - -int simpleList::numElements() -{ - return numObj; -} diff --git a/src/mapping.cc b/src/mapping.cc old mode 100644 new mode 100755 dissimilarity index 63% index ce34e52..9fe35b6 --- a/src/mapping.cc +++ b/src/mapping.cc @@ -1,364 +1,281 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include "common.h" -#include "hash.h" -#include "mapping.h" - -/**************************************************/ - -extern int verbose_svmtool; - -/**************************************************/ - -/* - * La clase mapping tiene como principal objetivo realizar - * la conversin de atributo a identificador numrico y viceversa. 
- * Este objeto se utiliza principalmente en tiempo de aprendizaje. - * - * Los atributos que contiene la clase mapping son: - * mapByKey hash_t * Hashing para permitir el acceso por atributo - * mapByNumber hash_t * Hash que nos permite acceso por identificador - * numrico o clave numrica (Numeric key) - * mapping_counter int Contador de elementos que contiene el mapping - */ - -/**************************************************/ - -/* - * Cada elemento insertado en el mapping es del tipo mapping_node_t. - * Este objeto est compuesto por: - * - * feature char * Atributo almacenado - * number char * Nmero identificativo de el atributo en formato alfanumrico - * num Int Nmero de veces que aparece - */ -class mapping_node_t -{ - public: - char *feature; - char *number; - int num; - - ~mapping_node_t() - { - delete feature; - delete number; - } -}; - -/**************************************************/ - -mapping::mapping() -{ - mapByKey = new hash_t; - mapByNumber = new hash_t; - hash_init(mapByKey,100000); - hash_init(mapByNumber,100000); - mapping_counter = 1; -} - - -/**************************************************/ - -mapping::~mapping() -{ - // Se elimina hash de mapeo por Id. numerico - hash_destroy(mapByNumber); - delete mapByNumber; - - // se eliminan todos los elementos insertados - int ret=0,numdel=0; - mapping_node_t *aux; - - hash_t *tptr = mapByKey; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - //Recorremos todas las listas de sinonimos el hash - //eliminando el contenido - for (i=0; inext; - aux = (mapping_node_t *) tmp->data; - delete aux; - - } /* while */ - } /* for */ - - hash_destroy(mapByKey); - delete mapByKey; -} - - -/**************************************************/ -/* - * Esta funcin aade un atributo al hashing. Recibe como parmetro - * el atributo, key. Devuelve un entero, que es el nmero que - * identifica una palabra. 
- * - * Para aadir un atributo al hash se comprueba si ya existe en - * el mapByKey. Si ya existe se incrementa el nmero de veces que - * aparece el atributo. Si no existe se asigna al atributo un nmero - * secuencial y se inserta en el mapByKey y en mapByNumber. - */ -int mapping::mappingAddByKey(const char *key) -{ - char strTmp[100]; - - mapping_node_t *tmp = (mapping_node_t *) hash_lookup(mapByKey,key); - - if ((uintptr_t)tmp == HASH_FAIL) - { - tmp = new mapping_node_t; - strcpy(strTmp,""); - tmp->feature = new char[strlen(key)+1]; - strcpy(tmp->feature,key); - //bynum - sprintf(strTmp,"%d",mapping_counter); - tmp->number = new char[strlen(strTmp)+1]; - strcpy(tmp->number,strTmp); - mapping_counter++; //endbynum - tmp->num=1; - hash_insert(mapByKey, tmp->feature, (uintptr_t) tmp); - //add by num - hash_insert(mapByNumber, tmp->number, (uintptr_t) tmp); - } - else tmp->num++; - return atoi(tmp->number); -} - - -/**************************************************/ -/* - * Parmetro de entrada el atributo, key, para la que se desea asignar - * e insertar un identificador numrico. - * Mira si existe la clave recibida como parmetro en mapByKey. - * Si no existe se le asigna un identificador numrico y se almacena en mapByNumber. - * Devuelve -1 si no existe key, identificador numrico asignado en caso contrario. - */ -int mapping::mappingAddNumber(const char *key) -{ - char strTmp[100]; - int ret = -1; - mapping_node_t *tmp = (mapping_node_t *) hash_lookup(mapByKey,key); - - if ((uintptr_t)tmp != HASH_FAIL) - { - strcpy(strTmp,""); - sprintf(strTmp,"%d",mapping_counter); - tmp->number = new char[strlen(strTmp)+1]; - strcpy(tmp->number,strTmp); - mapping_counter++; - hash_insert(mapByNumber, tmp->number, (uintptr_t) tmp); - ret = atoi(tmp->number); - } - - return ret; -} - - -/****************************************************************************/ -/* - * Busca el atributo, key, en el hashing correspondiente. - * Devuelve HASH_FAIL si no se encuentra key. 
En caso contrario, - * devuelve el identificador numrico. - */ -int mapping::mappingGetNumberByFeature(const char *key) -{ - mapping_node_t *tmp = (mapping_node_t *)hash_lookup(mapByKey,key); - if ( ((uintptr_t) tmp) == HASH_FAIL) return HASH_FAIL; - return atoi(tmp->number); -} - - -/****************************************************************************/ - -/* - * Busca el identificador numrico, key, en el hashing correspondiente. - * Devuelve HASH_FAIL si no se encuentra key. En caso contrario, - * devuelve el atributo correspondiente. - */ -char *mapping::mappingGetFeatureByNumber(const char *key) -{ - mapping_node_t *tmp = (mapping_node_t *) hash_lookup(mapByNumber,key); - if ( ((uintptr_t) tmp) == HASH_FAIL) return (char *) HASH_FAIL; - return tmp->feature; -} - - -/****************************************************************************/ - -/* - * Esta funcin recibe como parmetros el tamao mximo de entradas - * que puede tener el mapping (max_mapping_size) y el nmero mnimo - * de veces que ha de aparecer cada entrada de la tabla (count_cut_off). - * Se recorren los objetos insertados siempre i cuando el nmero de - * elementos contenidos por este objeto sea mayor a max_mapping_size. - * Se eliminan del mapping todas aquellas entradas que hayan aparecido - * count_cut_off veces o menos. - * Tras recorrer todas las entradas del mapping si el nmero de - * entradas del hashing sigue siendo mayor al tamao mximo permitido, - * max_mapping_size, se llama a - * - * mappingRepair(max_mapping_size,count_cut_off+1) - * - * Devuelve el nmero de entradas borradas. 
- */ -int mapping::mappingRepair(int max_mapping_size, int count_cut_off) -{ - int ret=0,numdel=0; - mapping_node_t *aux; - - hash_t *tptr = mapByKey; - - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - if ( verbose_svmtool == TRUE) - fprintf(stderr,"\n\tReducing Mapping (%d) --> size (%d)",count_cut_off,this->mappingNumElements()); - - old_bucket=tptr->bucket; - old_size=tptr->size; - if (max_mapping_sizemappingNumElements()) - { - for (i=0; inext; - aux = (mapping_node_t *) tmp->data; - - if (aux->num < count_cut_off) - { - hash_delete(mapByKey,aux->feature); - - hash_delete(mapByNumber,aux->number); - delete aux; - aux = NULL; - mapping_counter--; numdel++; - } - } /* while */ - } /* for */ - } /* if */ - - if ( verbose_svmtool == TRUE) - fprintf(stderr," - deleted (%d) = Final size (%d)",numdel,this->mappingNumElements()); - - int number=0; - if (max_mapping_sizemappingNumElements()) number = mappingRepair(max_mapping_size,count_cut_off+1); - return (number+numdel); -} - - -/****************************************************************************/ -/* - * Devuelve el nmero de elementos insertados en el objeto. - */ -int mapping::mappingNumElements() -{ - return mapping_counter-1; -} - - -/****************************************************************************/ -/* - * Este mtodo recibe el apuntador al fichero (f) desde el cual - * se crea el mapping, el nmero mnimo de veces que puede aparecer - * una palabra en el mapping (count_cut_off) i el tamao mximo del - * mapping (max_mapping_size). - * Se recorre el fichero aadiendo todos los atributos encontrados. - * Despus se repara el mapping llamando a mappingRepair. 
- */ -void mapping::mappingBuilt(FILE *f,int max_mapping_size, int count_cut_off) -{ - char str[200]=""; - int contador=0; - - fseek(f,0,SEEK_SET); - while (!feof(f)) - { - int ret1 = readTo(f,' ',0,str); - while (ret1>0) - { - ret1 = readTo(f,' ','\n',str); - this->mappingAddByKey(str); - contador++; - } - if ( verbose_svmtool == TRUE ) showProcessDone(contador,300,FALSE,"features"); - } - if ( verbose_svmtool == TRUE ) showProcessDone(contador,300,TRUE,"features"); - - if (this->mappingNumElements()>max_mapping_size) - { - if ( verbose_svmtool == TRUE ) fprintf(stderr,"REDUCING MAPPING: "); - int numdel = this->mappingRepair(max_mapping_size,count_cut_off); - } -} - - -/****************************************************************************/ - -/* - * Guarda los datos del mapping en el fichero con nombre, fName. - * Si el parmetro onlyFeatures vale TRUE escribe slo los atributos. - * En otro caso, escribe el identificador numrico, el atributo i - * el nmero de veces que aparece. - */ -void mapping::mappingWrite(char *fName,int onlyFeatures) -{ - FILE *f = openFile(fName,"w"); - int ret=0,numdel=0; - mapping_node_t *aux; - - hash_t *tptr = mapByKey; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - for (i=0; inext; - aux = (mapping_node_t *) tmp->data; - - if (onlyFeatures == TRUE) fprintf(f,"%s\n",aux->feature); - else fprintf(f,"%s %s %d\n",aux->number,aux->feature, aux->num); - - } /* while */ - } /* for */ - - fclose (f); -} +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// NOTE(review): the three system header names were missing in this copy of
+// the patch. They are restored from what this file calls: fprintf/sprintf,
+// atoi, strcpy/strlen. Verify against the upstream source.
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "common.h"
+#include "hash.h"
+#include "mapping.h"
+
+/**************************************************/
+
+extern int verbose;
+
+/**************************************************/
+
+/*
+ * The main purpose of the mapping class is to convert a feature
+ * into a numeric identifier and vice versa.
+ * This object is used mainly at learning time.
+ *
+ * The attributes held by the mapping class are:
+ *   mapByKey         hash_t *  Hash giving access by feature
+ *   mapByNumber      hash_t *  Hash giving access by numeric
+ *                              identifier (numeric key)
+ *   mapping_counter  int       Counter of the elements held by the mapping
+ */
+
+/**************************************************/
+
+/**************************************************/
+
+/*
+ * Creates both hashes, initialises each with hash_init(100000) and starts
+ * the identifier counter at 1.
+ * NOTE(review): the template argument of hash_t appears to have been lost
+ * in this copy of the patch; the expressions are reproduced as found.
+ */
+mapping::mapping()
+{
+  mapByKey = new hash_t;
+  mapByNumber = new hash_t;
+  mapByKey->hash_init(100000);
+  mapByNumber->hash_init(100000);
+  mapping_counter = 1;
+}
+
+/**************************************************/
+
+/*
+ * Destroys and frees both hashes.
+ * NOTE(review): the previous revision also deleted every mapping_node_t
+ * stored in mapByKey. Nothing here does so; unless hash_destroy() releases
+ * the stored nodes they are leaked - confirm in hash.h.
+ */
+mapping::~mapping()
+{
+  // Release the hash indexed by numeric identifier.
+  mapByNumber->hash_destroy();
+  delete mapByNumber;
+
+  // Release the hash indexed by feature.
+  mapByKey->hash_destroy();
+  delete mapByKey;
+}
+
+/**************************************************/
+/*
+ * This function adds a feature to the hash. It receives as parameter
+ * the feature, key.
Returns an integer, the number that
+ * identifies a word.
+ *
+ * To add a feature to the hash, mapByKey is checked first. If the
+ * feature already exists, the number of times it appears is
+ * incremented. If it does not exist, the feature is given a sequential
+ * number and inserted in both mapByKey and mapByNumber.
+ */
+int mapping::mappingAddByKey(const char *key)
+{
+  mapping_node_t *node = mapByKey->hash_lookup(key);
+
+  if ((long)node != HASH_FAIL)
+  {
+    // Known feature: only its occurrence count changes.
+    node->num++;
+    return atoi(node->number);
+  }
+
+  // New feature: build the node with its own copies of the feature text and
+  // of the decimal form of the next identifier.
+  char idText[100];
+  node = new mapping_node_t;
+  strcpy(idText,"");
+
+  node->feature = new char[strlen(key)+1];
+  strcpy(node->feature,key);
+
+  sprintf(idText,"%d",mapping_counter);
+  node->number = new char[strlen(idText)+1];
+  strcpy(node->number,idText);
+  mapping_counter++;
+
+  node->num=1;
+
+  // The same node is reachable through both indexes.
+  mapByKey->hash_insert(node->feature, node);
+  mapByNumber->hash_insert(node->number, node);
+
+  return atoi(node->number);
+}
+
+/**************************************************/
+/*
+ * The input parameter is the feature, key, for which a numeric
+ * identifier is to be assigned and inserted.
+ * key is looked up in mapByKey. If it is present, the node receives a
+ * new numeric identifier and is stored in mapByNumber under it.
+ * Returns -1 if key does not exist, otherwise the assigned numeric
+ * identifier.
+ * NOTE(review): the node's previous number buffer is replaced without
+ * being released here.
+ */
+int mapping::mappingAddNumber(const char *key)
+{
+  mapping_node_t *node = mapByKey->hash_lookup(key);
+  if ((long)node == HASH_FAIL) return -1;
+
+  char idText[100];
+  strcpy(idText,"");
+  sprintf(idText,"%d",mapping_counter);
+
+  node->number = new char[strlen(idText)+1];
+  strcpy(node->number,idText);
+  mapping_counter++;
+
+  mapByNumber->hash_insert(node->number, node);
+  return atoi(node->number);
+}
+
+/****************************************************************************/
+/*
+ * Looks up the feature, key, in the corresponding hash.
+ * Returns HASH_FAIL if key is not found.
Otherwise,
+ * returns the numeric identifier.
+ */
+int mapping::mappingGetNumberByFeature(const char *key)
+{
+  mapping_node_t *node = mapByKey->hash_lookup(key);
+
+  if ( ((long) node) != HASH_FAIL)
+  {
+    // The identifier is stored as text; hand it back as an int.
+    return atoi(node->number);
+  }
+  return HASH_FAIL;
+}
+
+/****************************************************************************/
+
+/*
+ * Looks up the numeric identifier, key, in the corresponding hash.
+ * Returns HASH_FAIL (cast to char *) if key is not found. Otherwise,
+ * returns the matching feature; the pointer refers to the node's own
+ * buffer, not to a copy.
+ */
+char *mapping::mappingGetFeatureByNumber(const char *key)
+{
+  mapping_node_t *node = mapByNumber->hash_lookup(key);
+
+  if ( ((long) node) != HASH_FAIL)
+  {
+    return node->feature;
+  }
+  return (char *) HASH_FAIL;
+}
+
+/****************************************************************************/
+
+/*
+ * This function receives the maximum number of entries the mapping
+ * may hold (max_mapping_size) and the minimum number of times each
+ * entry of the table has to appear (count_cut_off).
+ * The stored objects are traversed only when the number of elements
+ * held by this object is greater than max_mapping_size.
+ * Every entry that appeared fewer than count_cut_off times is removed
+ * from the mapping.
+ * After traversing all the entries, if the number of entries is still
+ * greater than the maximum allowed size, max_mapping_size,
+ * the function calls
+ *
+ *   mappingRepair(max_mapping_size,count_cut_off+1)
+ *
+ * Returns the number of deleted entries.
+ */
+int mapping::mappingRepair(int max_mapping_size, int count_cut_off)
+{
+  // NOTE(review): both size comparisons below were truncated in this copy of
+  // the patch ("max_mapping_sizemappingNumElements()"); they are restored as
+  // max_mapping_size < this->mappingNumElements(), matching the check made by
+  // mappingBuilt before it calls this function.
+  int numdel=0;
+  if ( verbose == TRUE)
+    fprintf(stderr,"\n\tReducing Mapping (%d) --> size (%d)",count_cut_off,this->mappingNumElements());
+
+  if (max_mapping_size<this->mappingNumElements())
+  {
+    hash_t::iterator it = mapByKey->begin();
+    while (it != mapByKey->end())
+    {
+      mapping_node_t *aux = (*it).second;
+
+      // Step past the entry BEFORE it can be removed: hash_delete() on the
+      // entry the iterator still refers to would leave the iterator dangling.
+      // Assumes hash_delete only invalidates iterators to the removed entry,
+      // as the standard associative containers do - TODO confirm in hash.h.
+      ++it;
+
+      // Entries seen fewer than count_cut_off times are dropped from both
+      // indexes and freed.
+      if (aux->num < count_cut_off)
+      {
+        mapByKey->hash_delete(aux->feature);
+        mapByNumber->hash_delete(aux->number);
+        delete aux;
+        // NOTE(review): mapping_counter is also the next identifier handed
+        // out by mappingAddByKey, so lowering it can reuse the number of a
+        // surviving entry. Left unchanged because mappingNumElements()
+        // depends on it.
+        mapping_counter--; numdel++;
+      }
+    } /* while */
+  }/* if */
+
+  if ( verbose == TRUE)
+    fprintf(stderr," - deleted (%d) = Final size (%d)",numdel,this->mappingNumElements());
+
+  // Still too large: try again with a stricter cut-off.
+  int number=0;
+  if (max_mapping_size<this->mappingNumElements()) number = mappingRepair(max_mapping_size,count_cut_off+1);
+  return (number+numdel);
+}
+
+/****************************************************************************/
+/*
+ * Returns the number of elements inserted in the object, computed as
+ * mapping_counter-1 (the counter starts at 1).
+ */
+int mapping::mappingNumElements()
+{
+  return mapping_counter-1;
+}
+
+/****************************************************************************/
+/*
+ * This method receives the file pointer (f) from which the mapping is
+ * built, the minimum number of times a word may appear in the mapping
+ * (count_cut_off) and the maximum size of the mapping
+ * (max_mapping_size).
+ * The file is traversed, adding every feature found.
+ * Afterwards the mapping is reduced by calling mappingRepair when it exceeds max_mapping_size.
+ */
+void mapping::mappingBuilt(FILE *f,int max_mapping_size, int count_cut_off)
+{
+  std::string token;
+  int processed=0;
+
+  // Always start from the beginning of the examples file.
+  fseek(f,0,SEEK_SET);
+  while (!feof(f))
+  {
+    // The first read of each round consumes the leading field up to the first
+    // blank; every following field is registered as a feature until readTo
+    // stops returning a positive value.
+    for (int status = readTo(f,' ',0,token); status>0; )
+    {
+      status = readTo(f,' ','\n',token);
+      this->mappingAddByKey(token.c_str());
+      processed++;
+    }
+    if ( verbose == TRUE ) showProcessDone(processed,300,FALSE,"features");
+  }
+  if ( verbose == TRUE ) showProcessDone(processed,300,TRUE,"features");
+
+  // Nothing to trim when the mapping already fits.
+  if (this->mappingNumElements()<=max_mapping_size) return;
+
+  if ( verbose == TRUE ) fprintf(stderr,"REDUCING MAPPING: ");
+  this->mappingRepair(max_mapping_size,count_cut_off);
+}
+
+/****************************************************************************/
+
+/*
+ * Saves the mapping data into the file named fName.
+ * If the onlyFeatures parameter is TRUE only the features are written.
+ * Otherwise the numeric identifier, the feature and the number of
+ * times it appears are written, one entry per line.
+ */
+void mapping::mappingWrite(const char *fName,int onlyFeatures)
+{
+  FILE *out = openFile(fName,"w");
+
+  hash_t::iterator it = mapByKey->begin();
+  while (it != mapByKey->end())
+  {
+    mapping_node_t *node = (*it).second;
+    if (onlyFeatures == TRUE)
+    {
+      fprintf(out,"%s\n",node->feature);
+    }
+    else
+    {
+      fprintf(out,"%s %s %d\n",node->number,node->feature, node->num);
+    }
+    it++;
+  }
+
+  fclose (out);
+}
+
+
+
diff --git a/src/reader.cc b/src/reader.cc new file mode 100644 index 0000000..fa36a5d --- /dev/null +++ b/src/reader.cc @@ -0,0 +1,129 @@ +/* + * Author: Quentin Pradet + * Copyright (C) 2011 CEA LIST + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include "reader.h"
+#include "nodo.h"
+#include "er.h"
+
+// NOTE(review): the three system header names were missing in this copy of
+// the patch. They are restored from what this file uses: std::set,
+// std::istringstream, std::string. Verify against the upstream source.
+#include <set>
+#include <sstream>
+#include <string>
+
+/*
+ * Reads the next word from the input and hands it back through *node.
+ * Lines reported as comments by parseWord (return value 1) are skipped.
+ * Returns parseWord's final code: -2 when no word could be read (*node is
+ * set to NULL), otherwise -1 or 0 with *node pointing to a node allocated
+ * with new by buildNode.
+ */
+int reader::nextNode(nodo **node) {
+  std::string word, comment;
+  std::set<std::string> tags;
+
+  int ret;
+
+  while((ret = parseWord(word, tags, comment)) == 1);
+
+  if(ret == -2) {
+    *node = NULL;
+  }
+  else {
+    *node = buildNode(word, comment);
+  }
+
+  return ret;
+}
+
+
+/*
+ * Allocates a nodo for word. realWrd always keeps the text as read; wrd is
+ * replaced by a "@CARD..." placeholder when erLookRegExp classifies the
+ * word as one of the CARD* categories, and is the word itself otherwise.
+ * comment is copied into the node.
+ */
+nodo* reader::buildNode(std::string &word, std::string &comment)
+{
+  nodo *node = new nodo;
+
+  // wrd and realWrd
+  node->realWrd = word;
+  int erRet=erLookRegExp(word);
+  switch (erRet)
+  {
+    case CARD: node->wrd ="@CARD"; break;
+    case CARDSEPS: node->wrd = "@CARDSEPS"; break;
+    case CARDPUNCT: node->wrd = "@CARDPUNCT"; break;
+    case CARDSUFFIX: node->wrd = "@CARDSUFFIX"; break;
+    default: node->wrd = word;
+  }
+
+  // comment
+  node->comment = comment;
+
+  return node;
+}
+
+/*
+ * Parses one input line into token (first field) and comment.
+ * Return values:
+ *   -2  the stream was already at end of file (is_good is cleared), or the
+ *       line held no token
+ *    1  the line starts with "##" and is to be skipped
+ *   -1  the token is ".", "?" or "!"
+ *    0  any other token
+ * NOTE(review): tagset is received by value, so the tags inserted below are
+ * not visible to the caller; the element type (lost in this copy of the
+ * patch) is restored as std::string because std::string values are inserted.
+ */
+int reader::parseWord(std::string& token, std::set<std::string> tagset, std::string &comment)
+{
+  if (m_input.eof()) {
+    is_good = false;
+    return -2;
+  }
+
+  // read the line
+  std::string line, tags;
+  getline(m_input, line);
+
+  // checking for commented lines
+  if (line.size() >= 2 && line[0] == '#' && line[1] == '#')
+    return 1;
+
+  // reading content: word (tag,tag,tag)
+  std::istringstream iss(line);
+  iss >> token >> tags;
+
+  if(token.empty())
+    return -2;
+
+
+  // are tags real tags or only a comment?
+  // NOTE(review): this branch strips a leading and trailing character as if
+  // they were parentheses, yet it is taken when the field does NOT start with
+  // '('. The test looks inverted, but it decides what ends up in comment, so
+  // it is left untouched - confirm the intended input format.
+  if(!tags.empty() && tags[0] != '(') {
+    // parse the tags
+
+    // remove parentheses around tags list
+    tags = tags.substr(1,tags.size()-2);
+    // Positions are kept in size_type so they are never narrowed to int.
+    std::string::size_type i = 0;
+    std::string::size_type j = tags.find_first_of(',');
+    while (j != std::string::npos)
+    {
+      std::string tag = tags.substr(i,j-i);
+      tagset.insert(tag);
+      i = j+1;
+      j = tags.find_first_of(',',i);
+    }
+    std::string tag = tags.substr(i);
+    tagset.insert(tag);
+  } else {
+    // tags is in fact only a comment: it becomes the start of comment and
+    // the remainder of the line is appended to it
+    swap(comment, tags);
+    comment += line_end(iss);
+  }
+
+  // TODO this should be configurable - does not work in every language
+  if (token == "." || token == "?" || token == "!")
+    return -1;
+
+  return 0;
+}
+
+/*
+ * Returns whatever is left unread in iss up to the end of its line.
+ */
+std::string reader::line_end(std::istringstream& iss) {
+  std::string result;
+  getline(iss, result);
+  return result;
+}
+
diff --git a/src/stack.cc b/src/stack.cc old mode 100644 new mode 100755 index 3ba2d52..cf23dd6 --- a/src/stack.cc +++ b/src/stack.cc @@ -5,7 +5,7 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU @@ -23,18 +23,17 @@ /****************************************************************************/ /* - * empty -- Indica si la pila est vaca o no + * empty -- Indica si la pila est  vac¡a o no * Parametros: * *ps: puntero a la pila * Devuelve: - * TRUE si est vaca FALSE si no lo est + * TRUE si est  vac¡a FALSE si no lo est  */ boolean empty(struct stack_t *ps) { return((boolean)(ps->top == -1)); } - /****************************************************************************/ /* @@ -47,23 +46,21 @@ void init_stack(struct stack_t *ps) ps->top = -1; } - /****************************************************************************/ /* - * pop -- Extrae el elemento del top de la pila si no est vaca + * pop -- Extrae el elemento del top de la pila si no est  vac¡a * Parametros: * *ps: puntero a la pila * Devuelve: - * El elemento del top de la pila si no est vaca + * El elemento del top de la pila si no est  vac¡a */ element_type pop(struct stack_t *ps) { - if (empty(ps)) return NULL; + if (empty(ps)) return NULL; return(ps->items[ps->top--]); } - /****************************************************************************/ /* @@ -74,8 +71,7 @@ element_type pop(struct stack_t *ps) */ void push(struct stack_t *ps, element_type x) { - if (ps->top == STACKSIZE -1) - { + if (ps->top == STACKSIZE -1) { fprintf(stderr,"Error: Stack Overflow. 
%d %d\n",ps->top,STACKSIZE-1); exit(1); } @@ -83,12 +79,11 @@ void push(struct stack_t *ps, element_type x) ps->items[++(ps->top)] = x; } - /****************************************************************************/ /* * stack_top -- Devuelve sin quitarlo de la pila el elemento que esta en el - * top de la misma, si no est vacia + * top de la misma, si no est  vacia * Parametros: * *ps: puntero a la pila * Devuelve: @@ -97,11 +92,10 @@ void push(struct stack_t *ps, element_type x) */ element_type stack_top(struct stack_t *ps) { - if (empty(ps)) return NULL; + if (empty(ps)) return NULL; return(ps->items[ps->top]); } - /****************************************************************************/ /* diff --git a/src/swindow.cc b/src/swindow.cc old mode 100644 new mode 100755 dissimilarity index 77% index cb652e3..767b9c8 --- a/src/swindow.cc +++ b/src/swindow.cc @@ -1,1274 +1,995 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include "hash.h" -#include "list.h" -#include "dict.h" -#include "weight.h" -#include "stack.h" -#include "swindow.h" -#include "er.h" -#include "common.h" -#include "marks.h" - -#define MAX_SENTENCE_LENGTH 1000 - -char lSentence[MAX_SENTENCE_LENGTH][250]; -int iLSentence = 0; - -/***************************************************************** - * Feature Generation - *****************************************************************/ - -void swindow::winPushStartWithLowerFeature(char *wrd,stack_t *pila) -{ - int startlower=0; - - //Comienza por Minuscula? - if (erLookRegExp2(&erStartLower,wrd)) - { - startlower = 1; - - //char *feat = new char[strlen(START_LOWER_MARK)+4]; - char *feat = new char[5];//mod Correcting dynamic memory errors - sprintf(feat,"%s:%d",START_LOWER_MARK,startlower); - - push(pila,feat); - } -} - - -void swindow::winPushStartWithNumberFeature(char *wrd,stack_t *pila) -{ - int startnumber=0; - - //Comienza por Numero? 
- if (erLookRegExp2(&erStartNumber,wrd)) - { - startnumber = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[strlen(START_NUMBER_MARK)+4]; - char *feat = new char[5]; - - sprintf(feat,"%s:%d",START_NUMBER_MARK,startnumber); - - push(pila,feat); - } -} - - -void swindow::winPushSuffixFeature(char *wrd, struct stack_t *pila,int longitud) -{ - //Obtenemos la longitud de la palabra - - char *feat = new char[longitud+6]; - int len = strlen(wrd); - char suf[longitud+1]; - //int a=0; - - strcpy(suf,""); - for (int i=len-longitud; i<=len-1; i++) - { - if (i>=0) sprintf(suf,"%s%c",suf,wrd[i]); - else sprintf(suf,"%s~",suf); - } - - sprintf(feat,"%s%d:%s",SUFFIX_MARK,longitud,suf); - push(pila,feat); -} - - -/* - * void winPushPreffixFeatures ( char *wrd, struct stack_t *pila, int longitud) - * esta funcion creara las "features" para la palabra desconocida - * y las apilara en en el parametro - */ -void swindow::winPushPrefixFeature(char *wrd, struct stack_t *pila,int longitud) -{ - //Obtenemos la longitud de la palabra - char *feat = new char[6+longitud]; - int len = strlen(wrd); - char pref[longitud+1]; - - strcpy(pref,""); - for (int i=0; i i) sprintf(pref,"%s%c",pref,wrd[i]); - else /*if (i > len-1 )*/ sprintf(pref,"%s~",pref); - } - - sprintf(feat,"%s%d:%s",PREFIX_MARK,longitud,pref); - push(pila,feat); -} - - -void swindow::winPushStartWithCapFeature(char *wrd, struct stack_t *pila) -{ - int startcap=0; - - //Comienza por Mayuscula? - if (erLookRegExp2(&erStartCap,wrd)) - { - startcap = 1; - //mod Correcting dynamic memory errors - //char *feat = new char[strlen(START_CAPITAL_MARK)+4]; - char *feat = new char[5]; - - sprintf(feat,"%s:%d",START_CAPITAL_MARK,startcap); - push(pila,feat); - } -} - - -void swindow::winPushAllUpFeature(char *wrd,stack_t *pila) -{ - int allup=0; - - //Esta toda la palabra en mayusculas? 
- if (erLookRegExp2(&erAllUp,wrd)) - { - allup = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[4]; - char *feat = new char[5]; - - sprintf(feat,"%s:%d",ALL_UPPER_MARK,allup); - push(pila,feat); - } -} - - -void swindow::winPushAllLowFeature(char *wrd,stack_t *pila) -{ - int alllow = 0; - //Esta toda la palabra en minusculas? - if (erLookRegExp2(&erAllLow,wrd)) - { - alllow = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[4]; - char *feat = new char[5]; - - sprintf(feat,"%s:%d",ALL_LOWER_MARK,alllow); - push(pila,feat); - } -} - - -void swindow::winPushContainCapFeature(char *wrd, stack_t *pila) -{ - int containcap = 0; - if (erLookRegExp2(&erContainCap,wrd)) - { - containcap = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[4]; - char *feat = new char[5]; - - sprintf(feat,"%s:%d",CONTAIN_CAP_MARK,containcap); - push(pila,feat); - } -} - - -void swindow::winPushContainCapsFeature(char *wrd, stack_t *pila) -{ - int containcaps = 0; - if (erLookRegExp2(&erContainCaps,wrd)) - { - containcaps = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[5]; - char *feat = new char[6]; - - sprintf(feat,"%s:%d",CONTAIN_CAPS_MARK,containcaps); - push(pila,feat); - } -} - - -void swindow::winPushContainPeriodFeature(char *wrd, stack_t *pila) -{ - int containperiod = 0; - //Contiene un punto? - if (erLookRegExp2(&erContainPeriod,wrd)) - { - containperiod = 1; - char *feat = new char[5]; - sprintf(feat,"%s:%d",CONTAIN_PERIOD_MARK,containperiod); - push(pila,feat); - } -} - - -void swindow::winPushContainCommaFeature(char *wrd, stack_t *pila) -{ - int containcomma = 0; - //Contiene un punto? - if (erLookRegExp2(&erContainComma,wrd)) - { - containcomma = 1; - char *feat = new char[5]; - sprintf(feat,"%s:%d",CONTAIN_COMMA_MARK,containcomma); - push(pila,feat); - } -} - - -void swindow::winPushContainNumFeature(char *wrd, stack_t *pila) -{ - int containnum = 0; - //Contiene un numero? 
- if (erLookRegExp2(&erContainNum,wrd)) - { - containnum = 1; - - char *feat = new char[5]; - //mod - //sprintf(feat,"CN:%d",containnum); - sprintf(feat,"%s:%d",CONTAIN_NUMBER_MARK,containnum); - - push(pila,feat); - } -} - - -void swindow::winPushMultiwordFeature(char *wrd, stack_t *pila) -{ - int multiword = 0; - //Es una palabra multiple? - if (erLookRegExp2(&erMultiWord,wrd)) - { - multiword = 1; - - //mod Correcting dynamic memory errors - //char *feat = new char[6]; - //sprintf(feat,"MW:%d",multiword); - char *feat = new char[5]; - sprintf(feat,"%s:%d",MULTIWORD_MARK,multiword); - - push(pila,feat); - } -} - - -void swindow::winPushLetterFeature(char *wrd , stack_t *pila, int position,int where) -{ - char *feature = new char[12]; - - if (COUNTING_FROM_END==where) - { - sprintf(feature,"%s%d:%c",CHAR_Z_MARK,position,wrd[strlen(wrd)-position]); - } - else - { - sprintf(feature,"%s%d:%c",CHAR_A_MARK,position,wrd[position-1]); - } - - push (pila,feature); -} - - -void swindow::winPushLenghtFeature(char *wrd, stack_t *pila) -{ - //Obtenemos la longitud de la palabra - int len = strlen(wrd); - - //Longitud de la palabra - //mod Correcting dynamic memory errors - //char *feat = new char[4]; - char *feat = new char[6]; - - sprintf(feat,"%s:%d",LENGTH_MARK,len); - push(pila,feat); -} - - -/* - * void winPushUnkownoFeatures ( char *wrd, struct stack_t *pila) - * esta funcion creara las "features" para la palabra desconocida - * y las apilara en en el parametro - */ -void swindow::winPushUnknownFeatures(char *wrd, struct stack_t *pila) -{ - int startcap=0,allup=0,alllow=0,wordlength=0,containnum=0,multiword=0,containcap=0,containcaps=0,containperiod=0; - - //Obtenemos la longitud de la palabra - int len = strlen(wrd); - char ant[10]=""; - - //Creamos el prefijo de longitud 2 - char *feat = new char[6]; - if (len > 1) sprintf(ant,"%c%c",wrd[0],wrd[1]); - else sprintf(ant,"%c~",wrd[0]); - sprintf(feat,"a2:%s",ant); - push(pila,feat); - - //Generamos el prefijo de longitud 
3 - feat = new char[7]; - if (len > 2) sprintf(ant,"%c%c%c",wrd[0],wrd[1],wrd[2]); - else sprintf(ant,"%s~",ant); - sprintf(feat,"a3:%s",ant); - push(pila,feat); - - //Generamos el prefijo de longitud 4 - feat = new char[8]; - if (len > 3) sprintf(ant,"%c%c%c%c",wrd[0],wrd[1],wrd[2],wrd[3]); - else sprintf(ant,"%s~",ant); - sprintf(feat,"a4:%s",ant); - push(pila,feat); - - //Generamos el sufijo de longitud 2 - feat = new char[6]; - if (len > 1) sprintf(feat,"z2:%c%c",wrd[len-2],wrd[len-1]); - else sprintf(feat,"z2:~%c","",wrd[len-1]); - push(pila,feat); - - //generamos el sufijo de longitud 3 - feat = new char[7]; - if (len > 2) sprintf(feat,"z3:%c%c%c",wrd[len-3],wrd[len-2],wrd[len-1]); - else if (len > 1) sprintf(feat,"z3:~%c%c",wrd[len-2],wrd[len-1]); - else sprintf(feat,"z3:~~%c",wrd[len-1]); - push(pila,feat); - - //generamos el sufijo de longitud 4 - feat = new char[8]; - //strcpy(prefix4,substr(wrd, 0, 4)); - if (len > 3) sprintf(feat,"z4:%c%c%c%c",wrd[len-4],wrd[len-3],wrd[len-2],wrd[len-1]); - else if (len > 2) sprintf(feat,"z4:~%c%c%c",wrd[len-3],wrd[len-2],wrd[len-1]); - else if (len > 1) sprintf(feat,"z4:~~%c%c",wrd[len-2],wrd[len-1]); - else sprintf(feat,"z4:~~~%c",wrd[len-1]); - push(pila,feat); - - //Comienza por Mayuscula? - if (erLookRegExp2(&erStartCap,wrd)) startcap = 1; - feat = new char[4]; - sprintf(feat,"A:%d",startcap); - push(pila,feat); - - //Esta toda la palabra en mayusculas? - if (erLookRegExp2(&erAllUp,wrd)) allup = 1; - feat = new char[5]; - sprintf(feat,"AA:%d",allup); - push(pila,feat); - - //Esta toda la palabra en minusculas? 
- if (erLookRegExp2(&erAllLow,wrd)) alllow = 1; - feat = new char[5]; - sprintf(feat,"aa:%d",alllow); - push(pila,feat); - - //Longitud de la palabra - feat = new char[6]; - sprintf(feat,"L:%d",len); - push(pila,feat); - - if (erLookRegExp2(&erContainCap,wrd)) containcap = 1; - feat = new char[5]; - sprintf(feat,"CA:%d",containcap); - push(pila,feat); - - if (erLookRegExp2(&erContainCaps,wrd)) containcaps = 1; - feat = new char[6]; - sprintf(feat,"CAA:%d",containcaps); - push(pila,feat); - - //Contiene un punto? - if (erLookRegExp2(&erContainPeriod,wrd)) containperiod = 1; - feat = new char[5]; - sprintf(feat,"CP:%d",containperiod); - push(pila,feat); - - //Contiene un numero? - if (erLookRegExp2(&erContainNum,wrd)) containnum = 1; - feat = new char[5]; - sprintf(feat,"CN:%d",containnum); - push(pila,feat); - - //Es una palabra multiple? - if (erLookRegExp2(&erMultiWord,wrd)) multiword = 1; - feat = new char[5]; - sprintf(feat,"MW:%d",multiword); - push(pila,feat); - - //Letra por la que empieza la palabra - feat = new char[5]; - sprintf(feat,"c1:%c",wrd[0]); - push(pila,feat); - - //Letra por la que acaba la palabra - feat = new char[5]; - //charn = wrd[len-1]; //substr(wrd, len-1, 1); - sprintf(feat,"cn:%c",wrd[len-1]); - push(pila,feat); -} - - -/* - * void winPushSwnFeature (struct stack_t *pila) - * Recibe como parametro , donde se apilara la "feature" - * Swn.Swn es el elemento final de frase que puede ser - * ! ? o . - */ -void swindow::winPushSwnFeature(struct stack_t *pila) -{ - char *feature = new char[10]; - sprintf(feature,"Swn:%s",last->wrd); - push(pila,feature); -} - - -/* - * void winPushAmbiguityFeature(void *ptr, dictionary *d, stack_t *pila, int direction) - * Genera el atributo que representa la ambiguedad de una palabra. - * Recibe como parametros: - * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) - * aunque se recibe como un void*. 
- * d, es el diccionario con el que estamos trabajarando - * pila,es la pila donde apilaremos el atributo generado - * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT - * o RIGHT_TO_LEFT). - */ -void swindow::winPushAmbiguityFeature(void *ptr,dictionary *d,struct stack_t *pila,int direction) -{ - char value[100],txt[5]; - nodo_feature_list *p = (nodo_feature_list *)ptr; - nodo *pn; - simpleList *list; - int w,*num,ret=0; - infoDict *pInfoDict; - - strcpy(value,""); - - char *feature = new char[100]; - strcpy(feature,""); - - num = (int *) p->l.getIndex(); - sprintf(value,"%s%d:",p->mark,*num); - pn = get(*num, direction); - if (pn!=NULL) - { - - w = d->getElement(pn->wrd); - if (w!=HASH_FAIL) - { - list = (simpleList *) d->getElementMaybe(w); - int numMaybe = d->getElementNumMaybe(w); - while (ret>=0) - { - pInfoDict = (infoDict *) list->getIndex(); - numMaybe--; - if (numMaybe>0) sprintf(value,"%s%s~",value,pInfoDict->txt); - else sprintf(value,"%s%s",value,pInfoDict->txt); - ret=list->next(); - } - list->setFirst(); - } - //is unknown word - else sprintf(value,"%s%s",value,"UNKNOWN"); - } - else sprintf(value,"%s%s",value,EMPTY_POS); - - strcpy(feature,value); - push (pila,feature); -} - - -/* - * void winPushMFTFeature(void *ptr, dictionary *d, stack_t *pila, int direction) - * Genera el atributo con la "Most Frequent Tag", la etiqueta mas frecuente. - * Recibe como parametros: - * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) - * aunque se recibe como un void*. - * d, es el diccionario con el que estamos trabajarando - * pila,es la pila donde apilaremos el atributo generado - * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT - * o RIGHT_TO_LEFT). 
- */ -void swindow::winPushMFTFeature(void *ptr,dictionary *d,struct stack_t *pila,int direction) -{ - char value[100],mft[5]; - nodo_feature_list *p = (nodo_feature_list *)ptr; - nodo *pn; - simpleList *list; - int w,*num,max=0,ret=0; - infoDict *pInfoDict; - - strcpy(value,""); - //strcpy(feature,""); - - num = (int *) p->l.getIndex(); - sprintf(value,"%s%d:",p->mark,*num); - pn = get(*num, direction); - if (pn!=NULL) - { - w = d->getElement(pn->wrd); - if (w!=HASH_FAIL) - { - list = (simpleList *) d->getElementMaybe(w); - int numMaybe = d->getElementNumMaybe(w); - while (ret>=0) - { - pInfoDict = (infoDict *) list->getIndex(); - numMaybe--; - if (pInfoDict->num>max) strcpy(mft,pInfoDict->txt); - ret=list->next(); - } - list->setFirst(); - sprintf(value,"%s%s",value,mft); - } - //is unknown word - else sprintf(value,"%s%s",value,"UNKNOWN"); - } - else sprintf(value,"%s%s",value,EMPTY_POS); - char *feature = new char[strlen(value)+1]; - strcpy(feature,value); - push (pila,feature); -} - - -/* - * void winPushMaybeFeature(void *ptr, dictionary *d, stack_t *pila, int direction) - * Genera tantos atributos "maybe" como posibles POS pueda tener la palabra, y los - * apila en . - * Recibe como parametros: - * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) - * aunque se recibe como un void*. - * d, es el diccionario con el que estamos trabajarando - * pila,es la pila donde apilaremos el atributo generado - * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT - * o RIGHT_TO_LEFT). 
- */ -void swindow::winPushMaybeFeature(void *ptr,dictionary *d,struct stack_t *pila,int direction) -{ - char value[100],txt[5]; - nodo_feature_list *p = (nodo_feature_list *)ptr; - nodo *pn; - simpleList *list; - int w,*num,ret=0; - infoDict *pInfoDict; - char *feature; - - strcpy(value,""); - num = (int *) p->l.getIndex(); - sprintf(txt,"%s%d~",p->mark,*num); - pn = get(*num, direction); - if (pn!=NULL) - { - w = d->getElement(pn->wrd); - - if (w!=HASH_FAIL) - { - list = (simpleList *) d->getElementMaybe(w); - - while (ret>=0) - { - feature = new char[10]; - strcpy(feature,""); - pInfoDict = (infoDict *) list->getIndex(); - sprintf(feature,"%s%s:1",txt,pInfoDict->txt); - push(pila,feature); - ret=list->next(); - } - list->setFirst(); - } - else - { - feature = new char[15]; - //is unknown word - sprintf(feature,"%s%s:1",txt,"UNKNOWN"); - push(pila,feature); - } - } - else - { - feature = new char[10]; - sprintf(feature,"%s%s:1",txt,EMPTY_POS); - push(pila,feature); - } -} - - -/* - * void winPushPosFeature(void *ptr, dictionary *d, stack_t *pila, int direction) - * Genera un atributo con la POS de algunos elementos de la ventana. - * Recibe como parametros: - * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) - * aunque se recibe como un void*. - * d, es el diccionario con el que estamos trabajarando - * pila,es la pila donde apilaremos el atributo generado - * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT - * o RIGHT_TO_LEFT). 
- */ -void swindow::winPushPosFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction) -{ - char value[100]="",name[100]="",txt[100]=""; - nodo_feature_list *p = (nodo_feature_list *)ptr; - nodo *pn; - infoDict *pInfoDict; - char *feature; - - int end=1,ret=1,w,*num; - - while (end>=0) - { - ret=1; - num = (int *) p->l.getIndex(); - //AKI3 - if (strcmp(name,EMPTY)==0) sprintf(name,"%s%d",p->mark,*num); - else sprintf(name,"%s,%d",name,*num); - pn = get(*num, direction); - - if (pn==NULL) strcpy(txt,EMPTY_POS); - //AKI3 - else if ( (strcmp(pn->pos,EMPTY)==0) || (*num==0) ) - { - - w = d->getElement(pn->wrd); - - if (w!=HASH_FAIL) - { - simpleList *list = (simpleList *) d->getElementMaybe(w); - int numMaybe = d->getElementNumMaybe(w); - - strcpy(txt,EMPTY); - while ( ret>=0 ) - { - pInfoDict = (infoDict *) list->getIndex(); - numMaybe--; - if (numMaybe>0) sprintf(txt,"%s%s_",txt,pInfoDict->txt); - else sprintf(txt,"%s%s",txt,pInfoDict->txt); - ret=list->next(); - } - list->setFirst(); - } - //is unknown word - else strcpy(txt,"UNKNOWN"); - } - else strcpy(txt,pn->pos);//AKI3 - - //AKI3 - if (strcmp(value,EMPTY)==0) sprintf(value,"%s",txt); - else sprintf(value,"%s~%s",value,txt); - - end = p->l.next(); - } - p->l.setFirst(); - sprintf(name,"%s:%s",name,value); - - feature = new char[strlen(name)+2]; - strcpy (feature,name); - //fprintf(stderr,"%s\n",feature); - push (pila,feature); -} - - -/* - * void winPushPOSFeature(void *ptr, dictionary *d, stack_t *pila, int direction) - * Genera un atributo con la palabra de algunos elementos de la ventana. - * Recibe como parametros: - * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) - * aunque se recibe como un void*. - * d, es el diccionario con el que estamos trabajarando - * pila,es la pila donde apilaremos el atributo generado - * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT - * o RIGHT_TO_LEFT). 
- */ -void swindow::winPushWordFeature(void *ptr,dictionary *d, struct stack_t *pila,int direction) -{ - char value[200],name[200],txt[100]; - nodo_feature_list *p = (nodo_feature_list *)ptr; - nodo *pn=NULL; - char *feature; - - int *num = (int *) p->l.getIndex(); - pn = get(*num, direction); - - if (pn==NULL) strcpy(value,EMPTY_WORD); - else strcpy(value,pn->wrd); - sprintf(name,"%s%d",p->mark,*num); - - while (p->l.next()>=0) - { - num = (int *) p->l.getIndex(); - sprintf(name,"%s,%d",name,*num); - pn = get(*num, direction); - - if (pn==NULL) strcpy(txt,EMPTY_WORD); - else strcpy(txt,pn->wrd); - sprintf(value,"%s~%s",value,txt); - } - p->l.setFirst(); - sprintf(name,"%s%s%s",name,":",value); - - feature = new char[strlen(name)+2]; - strcpy (feature,name); - push(pila,feature); -} - - -/****************************************************************************/ - -int swindow::sentenceLength() -{ - //Retorna el nmero de palabras que tiene la frase cargada en este objeto - return this->numObj; -} - - -/* - * void deleteList() - * Elimina todas las palabras existentes en la ventana - * Retorna el nmero de elementos que poseia la ventana - */ -void swindow::deleteList() -{ - if (first==NULL) return; - - while (first->next!=NULL) - { - first = first->next; - delete first->previous->stackScores; - delete first->previous; - } - - if ( last != NULL ) - { - delete last->stackScores; - delete last; - } - - first=NULL; - last=NULL; - index=NULL; - - return; -} - - -void swindow::init() -{ - iniGeneric(); -} - - -int swindow::iniGeneric() -{ - index = NULL; - beginWin = NULL; - endWin = NULL; - first = NULL; - last = NULL; - numObj = 0; - posBegin = posIndex; - posEnd = posIndex; - - //String para contener frase - memset(lSentence,0,sizeof(lSentence)); - iLSentence = 0; - - int ret = iniList(); - endWin = last; - if (ret>0) readSentence(); - - if (ret==-1) return -1; - else if (ret==0) posEnd = posIndex+last->ord; - else posEnd=posIndex+ret; - - beginWin = first; - - 
return ret; -} - - -int swindow::iniList() -{ - int j=0,ret=1; - - for(j=posIndex; ((j0)); j++) ret = readInput(); - - //ret >1 correct - // 0 if end of sentence - // -1 if there aren't words - // -2 if end of file - if (ret>0) ret=j-posIndex-1; - - return ret; -} - - -/****************************************************************************/ - -int swindow::readSentence() -{ - int ret=1; - while (ret>0) ret = readInput(); - return ret; -} - - -/****************************************************************************/ - -/* - * Read one line from corpus and add node to list - * Return 1 if it's ok - * 0 if end of sentence - * -1 if there aren't more words - * -2 if end of file - */ -int swindow::readInput() -{ - if (feof(input)) return -2; - - char value[2][100] ={EMPTY,EMPTY}; - char line[250] = EMPTY; - int is_comment = FALSE; - int is_empty_line = FALSE; - int ret = 0; - - fgets(line,250,input); - if ( line[0] == '#' && line[1] == '#' ) is_comment = TRUE; - - ret = sscanf(line,"%s %s\n",value[0],value[1]); - - if ( ret < 0 ) - { - if ( line[0] == '\n' ) is_empty_line = TRUE; - else return -2; - } - - if ( strlen(value[0]) > 0 && is_empty_line == FALSE && is_comment == FALSE ) - winAdd(value[0],value[1]); - - //fprintf(stderr,"\t%s %s \n%d %s\n",value[0],value[1],iLSentence,lSentence); - - strcpy(lSentence[iLSentence],line); - iLSentence++; - - //fprintf (stderr,"%s",lSentence[iLSentence-1]); - if ( iLSentence >= MAX_SENTENCE_LENGTH ) - { - //fprintf (stderr,"-------->"); - //return 0; //Si se supera el tamao mximo de frase se devuelve 0 - fprintf(stderr,"\nFound a sentence with more than %d words!!\n",MAX_SENTENCE_LENGTH); - exit(1); - } - - if ((strcmp(".",value[0])==0) || (strcmp("?",value[0])==0) || (strcmp("!",value[0])==0)) - { - //fprintf (stderr,"-------->"); - return 0; - } - - return 1; -} - - -/****************************************************************************/ - -/* Read one line from corpus and add node to list - * Return 1 if it's 
ok - * 0 if end of sentence - * -1 if there aren't more words - * -2 if end of file - */ -int swindow::readInput_old() -{ - if (feof(input)) return -2; - - char value[2][100]={EMPTY,EMPTY}; - int i=0,w=0,ret=1,isCom=0,addComAtEnd=0; - char ant='q',c = fgetc(input); - - while ((!feof(input)) && (c!='\n')) - { - if (i<2 && ant=='#' && c=='#') - { - char garbage[512]; - fgets(garbage,512,input); - w=0; - ret = 1; - i=0; - ant='q'; - strcpy(value[0],EMPTY); - strcpy(value[1],EMPTY); - c = fgetc(input); - } - if ((w==0) && (c==' ' || c=='\t' || c==32)) - { - - i=0; - ret = 1; - w=1; - ant='q'; - c = fgetc(input); - } - sprintf(value[w],"%s%c",value[w],c); - - i++; - ant=c; - c = fgetc(input); - - } - value[w][i]='\0'; - - if ((strlen(value[0])<=0) && (!isCom)) return -1; - - winAdd(value[0],value[1]); - - if ((strcmp(".",value[0])==0) || (strcmp("?",value[0])==0) || (strcmp("!",value[0])==0)) return 0; - return 1; -} - - -/****************************************************************************/ - -int swindow::winAdd(char *wrd, char *com) -{ - nodo *aux = new nodo; - if(numObj == 0) - { - aux->previous=NULL; - first = aux; - last = aux; - index = aux; - } - else - { - aux->previous = last; - last->next = aux; - last = aux; - } - aux->ord = numObj; - int erRet=erLookRegExp(wrd); - switch (erRet) - { - case CARD: strcpy(aux->wrd,"@CARD"); break; - case CARDSEPS: strcpy(aux->wrd,"@CARDSEPS"); break; - case CARDPUNCT: strcpy(aux->wrd,"@CARDPUNCT"); break; - case CARDSUFFIX: strcpy(aux->wrd,"@CARDSUFFIX"); break; - default: strcpy(aux->wrd,wrd); - } - strcpy(aux->realWrd,wrd); - strcpy(aux->posOld,EMPTY); - strcpy(aux->pos,EMPTY); - strcpy(aux->comment,com); - aux->stackScores = new stack_t; - init_stack(aux->stackScores); - aux->weight = 0; - aux->weightOld = 0; - aux->next=NULL; - numObj++; - return numObj; -} - - -/****************************************************************************/ - -swindow::~swindow() -{ - deleteList(); -} - - -swindow::swindow(FILE *in) 
-{ - input=in; - lengthWin = 7; - posIndex = 3; - - init(); -} - - -swindow::swindow(FILE *in,int number, int position) -{ - input=in; - - if ((number<3) || (number<=position)) - { - fprintf(stderr,"\nWindow Length can not be first or last element.\nLength should be greater than \"Interest Point Position\" or 3.\n"); - exit(0); - } - - lengthWin = number; - posIndex = position-1; - - init(); -} - - -swindow::swindow(FILE *in,int number) -{ - input=in; - lengthWin = number; - posIndex = number/2; - - init(); -} - - -/****************************************************************************/ - -/* Move Interest Point to next element */ -int swindow::next() -{ - int ret = -1; - if ((ret==-1) && (endWin->next!=NULL)) ret=1; - - if ((index==NULL) || (index->next==NULL)) return -1; - if ((posIndex>=posEnd) && (ret==-1)) return -1; - - if ((posIndexnext; - - if (posBegin==0) beginWin = beginWin->next; - else if ((posIndex>=posBegin) && (posBegin>0)) posBegin--; - - index = index->next; - return 0; -} - - -/****************************************************************************/ - -/* Move Interest Point to previous element */ -int swindow::previous() -{ - if ((index==NULL) || (index->previous==NULL)) return -1; - - if ((posBegin==0) && (beginWin->previous!=NULL)) beginWin = beginWin->previous; - else if (posIndex>posBegin) posBegin++; - - if (posEndprevious; - - index = index->previous; - return 0; -} - - -/****************************************************************************/ - -/* Get Interest Point */ -nodo *swindow::getIndex() -{ - return index; -} - - -/****************************************************************************/ - -nodo *swindow::get(int position,int direction) -{ - nodo *aux=NULL; - int i=0; - - if (position == 0) return index; - if (direction==2) position = -position; - if ( (numObj == 0) - || ((position<0) && (posIndex+position+10) && (posIndex+position>posEnd)) ) - return NULL; - - aux = index; - - while (i!=position) - { - if 
(position>0) - { - i++; - if (aux->next != NULL) aux = aux->next; - else return NULL; - } - else - { - i--; - if (aux->previous != NULL) aux = aux->previous; - else return NULL; - } - } - - return aux; -} - - -/****************************************************************************/ - -int swindow::show() -{ - int i = 0; - char wrd[TAM_WORD]; - - if (first==NULL) return 0; - - nodo *tmp = first; - nodo *actual = first; - - memset(wrd,0,sizeof(wrd)); - sscanf(lSentence[i],"%s",wrd); - if (strcmp(wrd,actual->realWrd)==0) - { - printf("%s %s %s\n",actual->realWrd,actual->pos,actual->comment); - } - else printf(lSentence[i]); - i++; - - while (actual->next!=NULL) - { - tmp=actual->next; - memset(wrd,0,sizeof(wrd)); - sscanf(lSentence[i],"%s",wrd); - if (strcmp(wrd,tmp->realWrd)==0) - { - printf("%s %s %s\n",tmp->realWrd,tmp->pos,tmp->comment); - actual = tmp; - } - else printf(lSentence[i]); - i++; - //printf("%s %s %s\n",actual->realWrd,actual->pos,actual->comment); - } - return 0; -} - - -/****************************************************************************/ - -void swindow::putLengthWin(int l) -{ - lengthWin = l; -} - - -/****************************************************************************/ - -void swindow::putIndex(int i) -{ - posIndex = i; -} - - -/****************************************************************************/ - -/* - * Modifica el valor de los pesos para una palabra - * Si: - * action = 0 --> Pone el peso mximo (put max score) - * action = 1 --> Inicializa los pesos (reset values) - * action = 2 --> Restaura el valor de la vuelta anterior(last lap value) - */ -int swindow::winMaterializePOSValues(int action) -{ - if (first==NULL) return 0; - - int inicio=1; - weight_node_t *w,max; - nodo *actual=first; - - while (actual!=NULL) - { - - switch (action) - { - case 0: //PUT MAX - inicio = 1; - while(!empty(actual->stackScores)) - { - w = (weight_node_t *) pop(actual->stackScores); - - if (inicio || w->data>max.data) - { - 
max.data=w->data; - strcpy(max.pos,w->pos); - inicio = 0; - } - delete w; - } - actual->weight=max.data; - strcpy(actual->pos,max.pos); - //Added for 2 laps tagging - actual->weightOld=max.data; - strcpy(actual->posOld,max.pos); - break; - case 1: //RESET VALUES - strcpy(actual->pos,""); - actual->weight=0; - break; - case 2: //PUT OLD - strcpy(actual->pos,actual->posOld); - actual->weight=actual->weightOld; - break; - } - actual=actual->next; - } - return 0; -} - - -/****************************************************************************/ - -/* - * int winExistUnkWord(int direction, dictionary *d) - * Esta funcion comprueba si hay parabras desconocidas. - * En caso de que el parametro direction sea: - * LEFT_TO_RIGHT - mira si hay desconocidas a la - * derecha del punto de interes de la ventana. - * RIGHT_TO_LEFT - mira si hay desconocidas a la izquierda - * del punto de interes de la ventana. - * Esta funcion devuelve: - * un entero >=0, si no hay desconocidas - * -1, si hay desconocidas - */ -int swindow::winExistUnkWord(int direction, dictionary *d) -{ - nodo *aux=index; - int ret=0,i=posIndex; - - if (index==NULL) return 1; - aux = index; - - while (ret>=0) - { - switch (direction) - { - case LEFT_TO_RIGHT: - if (aux->next==NULL || aux==endWin) ret=-1; - else aux = aux->next; - if (d->getElement(aux->wrd)==HASH_FAIL) return -1; - i++; - break; - case RIGHT_TO_LEFT: - if (aux->previous==NULL || aux==beginWin) ret=-1; - else aux = aux->previous; - if (d->getElement(aux->wrd)==HASH_FAIL) return -1; - i--; - break; - } - } - return 0; -} +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "swindow.h" + +#include "hash.h" +#include "nodo.h" +#include "list.h" +#include "dict.h" +#include "weight.h" +#include "er.h" +#include "common.h" +#include "marks.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SENTENCE_LENGTH 1000 +#define MAX_LINE_LENGTH 4096 + +/***************************************************************** + * Feature Generation + *****************************************************************/ + +void swindow::winPushStartWithLowerFeature(const std::string& wrd, std::stack&pila) +{ + int startlower=0; + + //Comienza por Minuscula? + if (erLookRegExp2(&erStartLower,wrd)) + { + startlower = 1; + + std::ostringstream feat; + feat << START_LOWER_MARK<<":"<&pila) +{ + int startnumber=0; + + //Comienza por Numero? 
+ if (erLookRegExp2(&erStartNumber,wrd)) + { + startnumber = 1; + + //mod Correcting dynamic memory errors + std::ostringstream feat; + feat << START_NUMBER_MARK<<":" << startnumber; + + pila.push(feat.str()); + } +} + +void swindow::winPushSuffixFeature(const std::string& wrd, std::stack& pila,int longitud) +{ + //Obtenemos la longitud de la palabra + + std::ostringstream feat; + int len = wrd.size(); + std::ostringstream suf; + //int a=0; + + for (int i=len-longitud; i<=len-1; i++) + { + if (i>=0) suf< y las apilara en en el parametro + */ +void swindow::winPushPrefixFeature(const std::string& wrd, std::stack& pila, int longitud) +{ + //Obtenemos la longitud de la palabra + std::ostringstream feat; + int len = wrd.size(); + std::ostringstream pref; + + for (int i=0; i i) pref< len-1 )*/ pref<<"~"; + } + + feat<&pila) +{ + int startcap=0; + + //Comienza por Mayuscula? + if (erLookRegExp2(&erStartCap,wrd)) + { + startcap = 1; + //mod Correcting dynamic memory errors + //std::string feat = new char[strlen(START_CAPITAL_MARK)+4]; + std::ostringstream feat; + + feat << START_CAPITAL_MARK<<":"<& pila) +{ + int allup=0; + + //Esta toda la palabra en mayusculas? + if (erLookRegExp2(&erAllUp,wrd)) + { + allup = 1; + + //mod Correcting dynamic memory errors + //std::string feat = new char[4]; + std::ostringstream feat; + + feat << ALL_UPPER_MARK<<":"<& pila) +{ + int alllow = 0; + //Esta toda la palabra en minusculas? 
+ if (erLookRegExp2(&erAllLow,wrd)) + { + alllow = 1; + + //mod Correcting dynamic memory errors + //std::string feat = new char[4]; + std::ostringstream feat; + + feat << ALL_LOWER_MARK<<":"<& pila) +{ + int containcap = 0; + if (erLookRegExp2(&erContainCap,wrd)) + { + containcap = 1; + + //mod Correcting dynamic memory errors + //std::string feat = new char[4]; + std::ostringstream feat; + + feat << CONTAIN_CAP_MARK<<":"<& pila) +{ + int containcaps = 0; + if (erLookRegExp2(&erContainCaps,wrd)) + { + containcaps = 1; + + //mod Correcting dynamic memory errors + //std::string feat; + std::ostringstream feat; + + feat << CONTAIN_CAPS_MARK <<":"<& pila) +{ + int containperiod = 0; + //Contiene un punto? + if (erLookRegExp2(&erContainPeriod,wrd)) + { + containperiod = 1; + std::stringstream feat; + feat << CONTAIN_PERIOD_MARK<<":"<& pila) +{ + int containcomma = 0; + //Contiene un punto? + if (erLookRegExp2(&erContainComma,wrd)) + { + containcomma = 1; + std::ostringstream feat; + feat << CONTAIN_COMMA_MARK<<":"<& pila) +{ + int containnum = 0; + //Contiene un numero? + if (erLookRegExp2(&erContainNum,wrd)) + { + containnum = 1; + + std::ostringstream feat; + //mod + //sprintf(feat,"CN:%d",containnum); + feat << CONTAIN_NUMBER_MARK<<":"<& pila) +{ + int multiword = 0; + //Es una palabra multiple? + if (erLookRegExp2(&erMultiWord,wrd)) + { + multiword = 1; + + //mod Correcting dynamic memory errors + //std::string feat = new char[6]; + //sprintf(feat,"MW:%d",multiword); + std::ostringstream feat; + feat << MULTIWORD_MARK<<":"<&pila, int where, int position) +{ + std::ostringstream feature; + + if (COUNTING_FROM_END==where) + { + feature<& pila) +{ + //Obtenemos la longitud de la palabra + int len = wrd.size(); + + //Longitud de la palabra + //mod Correcting dynamic memory errors + //std::string feat = new char[4]; + std::ostringstream feat; + + feat <, donde se apilara la "feature" + * Swn.Swn es el elemento final de frase que puede ser + * ! ? o . 
+ */ +void swindow::winPushSwnFeature(std::stack& pila) +{ + std::string feature = "Swn:"; + feature += "."; + pila.push(feature); +} + + +/* + * void winPushAmbiguityFeature(void *ptr, dictionary *d, std::stack *pila, int direction) + * Genera el atributo que representa la ambiguedad de una palabra. + * Recibe como parametros: + * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) + * aunque se recibe como un void*. + * d, es el diccionario con el que estamos trabajarando + * pila,es la pila donde apilaremos el atributo generado + * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT + * o RIGHT_TO_LEFT). + */ +void swindow::winPushAmbiguityFeature(void* ptr, dictionary* d, std::stack& pila, int direction) +{ + std::ostringstream value; + nodo_feature_list *p = (nodo_feature_list *)ptr; + nodo *pn; + int *num; + infoDict *pInfoDict; + + + num = *p->l.getIndex(); + value << p->mark << *num << ":"; + pn = get(*num, direction); + if (pn!=NULL) + { + dataDict* w = d->getElement(pn->wrd); + if ((long)w!=HASH_FAIL) + { + simpleList& list = d->getElementMaybe(w); + int numMaybe = d->getElementNumMaybe(w); + bool ret = true; + while (ret) + { + pInfoDict = *list.getIndex(); + numMaybe--; + if (numMaybe>0) value << pInfoDict->pos << "~"; + else value << pInfoDict->pos; + ret=list.next(); + } + list.setFirst(); + } + else value << "UNKNOWN"; //is unknown word + } + else value << EMPTY_POS; + + pila.push(value.str()); +} + + +/* + * void winPushMFTFeature(void *ptr, dictionary *d, std::stack *pila, int direction) + * Genera el atributo con la "Most Frequent Tag", la etiqueta mas frecuente. + * Recibe como parametros: + * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) + * aunque se recibe como un void*. 
+ * d, es el diccionario con el que estamos trabajarando + * pila,es la pila donde apilaremos el atributo generado + * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT + * o RIGHT_TO_LEFT). + */ +void swindow::winPushMFTFeature(void* ptr, dictionary* d, std::stack& pila, int direction) +{ + std::string value; + std::string mft; + nodo_feature_list *p = (nodo_feature_list *)ptr; + nodo *pn; + int *num,max=0; + infoDict *pInfoDict; + + num = *p->l.getIndex(); + value = p->mark; + value += *num + ":"; + pn = get(*num, direction); + if (pn!=NULL) + { + dataDict* w = d->getElement(pn->wrd); + if ((long)w!=HASH_FAIL) + { + simpleList& list = d->getElementMaybe(w); + int numMaybe = d->getElementNumMaybe(w); + bool ret = true; + while (ret) + { + pInfoDict = *list.getIndex(); + numMaybe--; + if (pInfoDict->num>max) mft = pInfoDict->pos; + ret=list.next(); + } + list.setFirst(); + value += mft; + } + else value += "UNKNOWN"; //is unknown word + } + else value += EMPTY_POS; + + pila.push(value); +} + + +/* + * void winPushMaybeFeature(void *ptr, dictionary *d, std::stack *pila, int direction) + * Genera tantos atributos "maybe" como posibles POS pueda tener la palabra, y los + * apila en . + * Recibe como parametros: + * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) + * aunque se recibe como un void*. + * d, es el diccionario con el que estamos trabajarando + * pila,es la pila donde apilaremos el atributo generado + * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT + * o RIGHT_TO_LEFT). 
+ */ +void swindow::winPushMaybeFeature(void* ptr, dictionary* d, std::stack& pila, int direction) +{ + std::string value; + std::ostringstream txt; + nodo_feature_list *p = (nodo_feature_list *)ptr; + nodo *pn; + int *num; + infoDict *pInfoDict; + + num = *p->l.getIndex(); + txt << p->mark << *num << "~"; + pn = get(*num, direction); + if (pn!=NULL) + { + dataDict* w = d->getElement(pn->wrd); + + if ((long)w!=HASH_FAIL) + { + simpleList& list = d->getElementMaybe(w); + bool ret = true; + while (ret) + { + std::string feature; + pInfoDict = *list.getIndex(); + feature += txt.str() + pInfoDict->pos + ":1"; + pila.push(feature); + ret=list.next(); + } + list.setFirst(); + } + else + { + std::string feature; + feature += txt.str() + "UNKNOWN:1"; + pila.push(feature); + } + } + else + { + std::string feature; + feature += txt.str() + EMPTY_POS + ":1"; + pila.push(feature); + } +} + + +/* + * void winPushPosFeature(void *ptr, dictionary *d, std::stack *pila, int direction) + * Genera un atributo con la POS de algunos elementos de la ventana. + * Recibe como parametros: + * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) + * aunque se recibe como un void*. + * d, es el diccionario con el que estamos trabajarando + * pila,es la pila donde apilaremos el atributo generado + * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT + * o RIGHT_TO_LEFT). 
+ */ +void swindow::winPushPosFeature(void* ptr, dictionary* d, std::stack& pila, int direction) +{ + std::string value; + std::string txt; + nodo_feature_list *p = (nodo_feature_list *)ptr; + nodo *pn; + infoDict *pInfoDict; + std::string feature; + + int *num; + + bool end = false; + while (!end) + { + num = *p->l.getIndex(); + + pn = get(*num, direction); + + if (pn==NULL) txt = EMPTY_POS; + else if ( (pn->pos == EMPTY) || (*num==0) ) //AKI3 + { + + dataDict* w = d->getElement(pn->wrd); + + if ((long)w!=HASH_FAIL) + { + simpleList& list = d->getElementMaybe(w); + int numMaybe = d->getElementNumMaybe(w); + + txt = EMPTY; + bool ret = true; + while ( ret ) + { + pInfoDict = *list.getIndex(); + numMaybe--; + if (numMaybe>0) txt += pInfoDict->pos + "_"; + else txt += pInfoDict->pos; + ret=list.next(); + } + list.setFirst(); + } + else txt = "UNKNOWN"; //is unknown word + } + else txt = pn->pos; //AKI3 + + if (value.empty()) value = txt; //AKI3 + else value += "~" + txt; + + end = !p->l.next(); + } + p->l.setFirst(); + + feature = ":" + value; + //std::cerr << feature << std::endl; + pila.push(feature); +} + +/* + * void winPushPOSFeature(void *ptr, dictionary *d, std::stack *pila, int direction) + * Genera un atributo con la palabra de algunos elementos de la ventana. + * Recibe como parametros: + * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list) + * aunque se recibe como un void*. + * d, es el diccionario con el que estamos trabajarando + * pila,es la pila donde apilaremos el atributo generado + * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT + * o RIGHT_TO_LEFT). 
+ */ +void swindow::winPushWordFeature(void* ptr, dictionary* /*d*/, std::stack& pila, int direction) +{ + std::string value; + std::ostringstream name; + std::string txt; + nodo_feature_list *p = (nodo_feature_list *)ptr; + nodo *pn=NULL; + + int *num = *p->l.getIndex(); + pn = get(*num, direction); + + if (pn==NULL) value = EMPTY_WORD; + else value = pn->wrd; + name << std::string(p->mark) << *num; + + while (p->l.next()) + { + num = *p->l.getIndex(); + name << "," << *num; + + pn = get(*num, direction); + + if (pn==NULL) txt = EMPTY_WORD; + else txt = pn->wrd; + value += "~" + txt; + } + p->l.setFirst(); + name << ":" << value; + + pila.push(name.str()); +} + + +/****************************************************************************/ + + +int swindow::sentenceLength() +{ + //Retorna el número de palabras que tiene la frase cargada en este objeto + return this->numObj; +} + +/* + * void deleteList() + * Elimina todas las palabras existentes en la ventana + * Retorna el número de elementos que poseia la ventana + */ +void swindow::deleteList() +{ +// std::cerr << "swindow::deleteList " << numObj << " elements" << std::endl; + if (first==0) return; + + int i = 0; + while (first->next!=0) + { + i++; +// std::cerr << "swindow::deleteList delete " << i << "th element: " << &(first->strScores) << " " << first->strScores << std::endl; + assert(first->next->previous == first); + first = first->next; + delete first->previous; + } + i++; + if ( last != 0 ) + { +// std::cerr << "swindow::deleteList delete " << i << "th element" << std::endl; + delete last; + } + + first=0; + last=0; + index=0; + numObj=0; +} + + +void swindow::init(dictionary* dic) +{ + if(m_output != NULL) + iniGeneric(dic); +} + +int swindow::iniGeneric(dictionary* dic) +{ + posBegin = posIndex; + posEnd = posIndex; + + int ret = iniList(dic); + endWin = last; + if (ret>=0) readSentence(dic); + + if (ret==-1) return -1; + else if (ret==0) posEnd = posIndex+last->ord; + else posEnd=posIndex+ret; + + 
beginWin = first; + + return ret; +} + +int swindow::iniList(dictionary* dic) +{ + int j=0,ret=1; + + for(j=posIndex; ((j=0)); j++) ret = readInput(dic); + + //ret >1 correct + // 0 if end of sentence + // -1 if there aren't words + // -2 if end of file + if (ret>=0) ret=j-posIndex-1; + + return ret; +} + +void swindow::setWindow(const std::vector& user_window) { + this->user_window = user_window; +} + + +/****************************************************************************/ + +int swindow::readSentence(dictionary* dic) +{ + int ret=1; + while (ret>=0) ret = readInput(dic); + return ret; +} + + +int swindow::readInput(dictionary* dic) { + std::string word, comment; + std::set tagset; + + int ret; + while((ret = m_reader.parseWord(word, tagset, comment)) == 1); + nodo* node = m_reader.buildNode(word, comment); + + winAdd(node); + if(!tagset.empty()) dic->addBackupEntry(word, tagset); + + return ret; +} + + +/****************************************************************************/ + +void swindow::winAdd(nodo *aux) +{ + // add the node in the software window + if(numObj == 0) + { + first = aux; + last = aux; + index = aux; + } + else + { + aux->previous = last; + last->next = aux; + last = aux; + } + + numObj++; +} + + +/****************************************************************************/ + +swindow::~swindow() +{ +// deleteList(); +} + +swindow::swindow(istream& in, std::ostream* output, dictionary* dic) : m_output(output), m_reader(in), +first(0), last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0) +{ + lengthWin = 7; + posIndex = 3; + + init(dic); +} + +swindow::swindow(int lengthWin, dictionary *dic): m_output(NULL), m_reader(), +first(0), last(0), numObj(0), beginWin(0), endWin(0), posBegin(0), posEnd(lengthWin) +{ + this->lengthWin = lengthWin; + posIndex = 2; + init(dic); +} + +swindow::swindow(istream& in, int number, int position, std::ostream* output, dictionary* dic) : m_output(output), m_reader(in), +first(0), 
last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0) +{ + + if ((number<3) || (number<=position)) + { fprintf(stderr,"\nWindow Length can not be first or last element.\nLength should be greater than \"Interest Point Position\" or 3.\n"); + exit(0); + } + + lengthWin = number; + posIndex = position-1; + + init(dic); +} + +swindow::swindow(istream& in, int number, std::ostream* output, dictionary* dic) : m_output(output), m_reader(in), +first(0), last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0) +{ + + lengthWin = number; + posIndex = number/2; + + init(dic); +} + +/****************************************************************************/ + +/* Move Interest Point to next element */ +bool swindow::next() +{ + bool ret = false; + if (endWin->next!=0) ret=true; + + if ((index==0) || (index->next==0)) return false; + if ((posIndex>=posEnd) && (!ret)) return false; + + if ((posIndexnext; + + if (posBegin==0) beginWin = beginWin->next; + else if ((posIndex>=posBegin) && (posBegin>0)) posBegin--; + + index = index->next; + return true; +} + +/****************************************************************************/ + +/* Move Interest Point to previous element */ +bool swindow::previous() +{ + if ((index==NULL) || (index->previous==NULL)) return false; + + if ((posBegin==0) && (beginWin->previous!=NULL)) beginWin = beginWin->previous; + else if (posIndex>posBegin) posBegin++; + + if (posEndprevious; + + index = index->previous; + return true; +} + +/****************************************************************************/ + +/* Get Interest Point */ +nodo *swindow::getIndex() +{ + return index; +} + +/****************************************************************************/ + +nodo *swindow::get(int position, int direction) +{ + if (direction==2) position = -position; + if ( ((position<0) && (posIndex+position+10) && (posIndex+position>posEnd)) ) + return NULL; + + if(!user_window.empty()) + return 
get_user(position); + else + return get_intern(position); +} + +// position must be valid, ie. "in the window" +nodo *swindow::get_user(int position) +{ + return user_window[position+2]; +} + +nodo *swindow::get_intern(int position) +{ + nodo *aux=index; + int i=0; + + if (position == 0) return index; + + while (i!=position) + { + if (position>0) + { i++; + if (aux->next != NULL) aux = aux->next; + else return NULL; + } + else + { i--; + if (aux->previous != NULL) aux = aux->previous; + else return NULL; + } + } + + return aux; +} + + +/****************************************************************************/ + +int swindow::show(int showScoresFlag, int showComments) +{ + std::string wrd; + + if (first==NULL) return 0; + + nodo *actual = first; + + while(actual != NULL) { + *m_output << actual->realWrd << " " << actual->pos; + + if ( showScoresFlag == TRUE && !actual->strScores.empty() ) { + *m_output << " " << actual->strScores; + } + + if ( showComments == TRUE && !actual->comment.empty() ) { + *m_output << " " << actual->comment; + } + + *m_output << std::endl; + + actual = actual->next; + } + + return 0; +} + +/****************************************************************************/ + +void swindow::putLengthWin(int l) +{ + lengthWin = l; +} + +/****************************************************************************/ + +void swindow::putIndex(int i) +{ + posIndex = i; +} + +/****************************************************************************/ + +/* + * Modifica el valor de los pesos para una palabra + * Si: + * action = 0 --> Pone el peso máximo (put max score) + * action = 1 --> Inicializa los pesos (reset values) + * action = 2 --> Restaura el valor de la vuelta anterior(last lap value) + */ +int swindow::winMaterializePOSValues(int action) +{ + if (first==NULL) return 0; + + int inicio=1; + weight_node_t *w,max; + nodo *actual=first; + + while (actual!=NULL) + { + + switch (action) + { + case 0: //PUT MAX + inicio = 1; + 
while(!actual->stackScores.empty()) + { + w = actual->stackScores.top(); + actual->stackScores.pop(); + + if (inicio || w->data>max.data) + { + max.data=w->data; + max.pos = w->pos; + inicio = 0; + } + delete w; + } + actual->weight=max.data; + actual->pos = max.pos; + //Added for 2 laps tagging + actual->weightOld=max.data; + actual->posOld = max.pos; + break; + case 1: //RESET VALUES + actual->pos = ""; + actual->weight=0; + break; + case 2: //PUT OLD + actual->pos = actual->posOld; + actual->weight=actual->weightOld; + break; + } + actual=actual->next; + } + return 0; +} + +/****************************************************************************/ + +/* + * int winExistUnkWord(int direction, dictionary *d) + * Esta funcion comprueba si hay parabras desconocidas. + * En caso de que el parametro direction sea: + * LEFT_TO_RIGHT - mira si hay desconocidas a la + * derecha del punto de interes de la ventana. + * RIGHT_TO_LEFT - mira si hay desconocidas a la izquierda + * del punto de interes de la ventana. 
+ * Esta funcion devuelve: + * un entero >=0, si no hay desconocidas + * -1, si hay desconocidas + */ +int swindow::winExistUnkWord(int direction, dictionary *d) +{ + nodo *aux=index; + int ret=0,i=posIndex; + + if (index==NULL) return 1; + aux = index; + + while (ret>=0) + { + switch (direction) + { + case LEFT_TO_RIGHT: + if (aux->next==NULL || aux==endWin) ret=-1; + else aux = aux->next; + if ((long)d->getElement(aux->wrd)==HASH_FAIL) return -1; + i++; + break; + case RIGHT_TO_LEFT: + if (aux->previous==NULL || aux==beginWin) ret=-1; + else aux = aux->previous; + if ((long)d->getElement(aux->wrd)==HASH_FAIL) return -1; + i--; + break; + } + } + return 0; +} + diff --git a/src/tagger.cc b/src/tagger.cc old mode 100644 new mode 100755 dissimilarity index 80% index 30810a4..74d16da --- a/src/tagger.cc +++ b/src/tagger.cc @@ -1,849 +1,870 @@ -/* - * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include "marks.h" -#include "strategies.h" -#include "hash.h" -#include "list.h" -#include "weight.h" -#include "dict.h" -#include "stack.h" -#include "swindow.h" -#include "tagger.h" -#include "common.h" - -/***************************************************************/ - -struct tms tbuffStartUp,tbuffEndStartUp; -clock_t startUpTime,endStartUpTime; -double sysFexTime=0, usrFexTime=0,realFexTime=0; -double sysSVMTime=0, usrSVMTime=0,realSVMTime=0; - -/***************************************************************/ - -extern int verbose_svmtool; -int NUM_UNK_POS=0; - -/***************************************************************/ - -hash_t *tagger::taggerCreateBiasHash(char *name) -{ - hash_t *bias = new hash_t; - int i=0; - char c=' ',weight[20]; - weight_node_t *w; - FILE *f; - - hash_init(bias,40); - - if ((f = fopen(name, "rt"))== NULL) - { - fprintf(stderr, "Error opening file: %s",name); - exit(0); - } - - while (!feof(f)) - { - c = fgetc(f); - if (c!='#') - { - w = new weight_node_t; - strcpy(weight,""); - i=0; - strcpy(w->pos,""); - while ((c!='\n') && (!feof(f))) - { - if (c!=' ' && c!='\n' && c!='\t' && i==1) sprintf(weight,"%s%c",weight,c); - else if (c!=' ' && c!='\n' && c!='\t' && i==0) - { - if (c!=':') sprintf(w->pos,"%s%c",w->pos,c); - else i=1; - } - c = fgetc(f); - } - w->data = (long double)0; - w->data = atof (weight); - hash_insert(bias,w->pos,(uintptr_t)w); - } //end if - else while(c=fgetc(f)!='\n'); - } - fclose(f); - return bias; -} - - -/***************************************************************/ - -tagger::tagger(char *model) -{ - char name[150]=""; - strcpy(flow,"LR"); - taggerNumLaps = 1; - taggerKFilter = 0; - taggerUFilter = 0; - 
//modstrat //0; - taggerStrategy = STRA_1P_DEFAULT; - taggerWinIndex = -1; - taggerWinLength = -1; - strcpy(taggerModelName,model); - strcpy (taggerBackupDict,""); - stk = new stack_t; - init_stack(stk); -} - - -/***************************************************************/ - -void tagger::taggerLoadModels(models_t *model, int taggerNumModel) -{ - char name[150],flow2[5],flow1[5]; - - //Cargamos la lista de "features" para palabras conocidas - sprintf(name,"%s.A%d",taggerModelName,taggerNumModel); - if (verbose_svmtool) fprintf(stderr,"\nLoading FEATURES FOR KNOWN WORDS from < %s >\n",name); - createFeatureList(name,&model->featureList); - //Cargamos la lista de "features" para palabras desconocidas - sprintf(name,"%s.A%d.UNK",taggerModelName,taggerNumModel); - if (verbose_svmtool) fprintf(stderr,"\nLoading FEATURES FOR UNKNOWN WORDS from < %s >\n",name); - createFeatureList(name,&model->featureListUnk); - - if (strcmp(flow,"LRL")==0) - { - strcpy(flow1,"LR"); strcpy(flow2,"RL"); - - sprintf(name,"%s (Right-to-Left)",flow1); - if (verbose_svmtool) fprintf(stderr,"\nREADING MODELS < direction = %s >\n",name); - - sprintf(name,"%s.M%d.%s.MRG",taggerModelName,taggerNumModel,flow2); - if (verbose_svmtool) fprintf(stderr,"-. Loading MERGED MODEL FOR KNOWN WORDS from < %s >\n",name); - model->wr2 = new weightRepository(name,taggerKFilter); - - sprintf(name,"%s.UNK.M%d.%s.MRG",taggerModelName,taggerNumModel,flow2); - if (verbose_svmtool) fprintf(stderr,"-. Loading MERGED MODEL FOR UNKKNOWN WORDS from < %s >\n\n",name); - model->wrUnk2 = new weightRepository(name,taggerUFilter); - } - else strcpy(flow1,flow); - - if (strcmp(flow1,"RL")==0) sprintf(name,"%s (Right-to-Left)",flow1); - else sprintf(name,"%s (Left-to-Right)",flow1); - - if (verbose_svmtool) fprintf(stderr,"\nREADING MODELS < direction = %s >\n",name); - - sprintf(name,"%s.M%d.%s.MRG",taggerModelName,taggerNumModel,flow1); - if (verbose_svmtool) fprintf(stderr,"-. 
Loading MERGED MODEL FOR KNOWN WORDS from < %s >\n",name); - model->wr = new weightRepository(name,taggerKFilter); - - sprintf(name,"%s.UNK.M%d.%s.MRG",taggerModelName,taggerNumModel,flow1); - if (verbose_svmtool) fprintf(stderr,"-. Loading MERGED MODEL FOR UNKNOWN WORDS from < %s >\n",name); - model->wrUnk = new weightRepository(name,taggerUFilter); - -} - - -/***************************************************************/ - -void tagger::taggerLoadModelsForTagging() -{ - startUpTime = times(&tbuffStartUp); - - int modelsNeeded=1; - char name[150]; - - sprintf(name,"%s.DICT",taggerModelName); - if (strcmp(taggerBackupDict,"")!=0) - { - if (verbose_svmtool) fprintf(stderr,"Loading DICTIONARY from < %s > with BACKUP DICTIONARY from < %s >\n",name,taggerBackupDict); - d = new dictionary(name,taggerBackupDict); - } - else - { - if (verbose_svmtool) fprintf(stderr,"Loading DICTIONARY from < %s >\n",name); - d = new dictionary(name); - } - - sprintf(name,"%s.UNKP",taggerModelName); - if (verbose_svmtool) fprintf(stderr,"Loading UNKNOWN WORDS POS from < %s >\n",name); - weightUnk = taggerCreateWeightUnkArray(name); - - //modstrat 1 - if ( taggerStrategy == STRA_2P_RELABELING - || taggerStrategy == STRA_1P_ROBUST_UNK /*modstrat 4*/ ) modelsNeeded = 2; - - taggerModelList = new models_t[modelsNeeded]; - taggerModelRunning = &taggerModelList[0]; - - //modstrat 0) - if (taggerStrategy == STRA_1P_DEFAULT ) - taggerLoadModels(taggerModelRunning,0); - //modstrat 2) - else if (taggerStrategy == STRA_1P_UNSUPERVISED ) - taggerLoadModels(taggerModelRunning,3); - //modstrat 4) - else if (taggerStrategy == STRA_1P_ROBUST_UNK ) - { - taggerLoadModels(taggerModelRunning,0); - taggerLoadModels(&taggerModelList[1],2); - } - //modstrat 5) - else if (taggerStrategy == STRA_1P_VERY_ROBUST_UNK ) - taggerLoadModels(taggerModelRunning,4); - //modstrat 1) - else if (taggerStrategy == STRA_2P_RELABELING ) - { - taggerLoadModels(taggerModelRunning,2); - taggerLoadModels(&taggerModelList[1],1); 
- taggerNumLaps = 2; - } - else - { - fprintf(stderr,"Execution error: Strategy %d doesn't exist!!\n\n",taggerStrategy); - exit(0); - } - - endStartUpTime = times(&tbuffEndStartUp); -} - - -void tagger::taggerInit() -{ - - // int modelsNeeded=1; - char name[150]; - - //Mirar si existe fichero .WIN - if (taggerWinIndex==-1 && taggerWinLength==-1) - { - sprintf(name,"%s.WIN",taggerModelName); - FILE *f = fopen (name,"r"); - if ( f == NULL ) sw = new swindow(stdin); - else - { - fscanf(f,"%d %d",&taggerWinLength,&taggerWinIndex); - fclose(f); - sw = new swindow (stdin,taggerWinLength,taggerWinIndex); - } - } - else if (taggerWinIndex==-1) sw = new swindow (stdin,taggerWinLength); - else sw = new swindow (stdin,taggerWinLength,taggerWinIndex); -} - - -/***************************************************************/ - -tagger::~tagger() -{ - int modelsNeeded=1; - - //modstrat 1) - if (taggerStrategy == STRA_2P_RELABELING ) - modelsNeeded = 2; - - delete stk; - delete d; - delete sw; - delete[] weightUnk; //Mod - - for (int i=0;iprevious()==0); - nodo *elem = sw->getIndex(); - - if (sw->winExistUnkWord(1,d)==-1) - taggerModelRunning=&taggerModelList[1]; - else taggerModelRunning=&taggerModelList[0]; - - taggerGenerateScore(elem,1); - - while(sw->next()==0) - { - elem = sw->getIndex(); - - if (sw->winExistUnkWord(1,d)==-1) - taggerModelRunning=&taggerModelList[1]; - else taggerModelRunning=&taggerModelList[0]; - - taggerGenerateScore(elem,1); - cont++; - } - - if (strcmp(flow,"LRL")==0) sw->winMaterializePOSValues(1); - - return cont; -} - - -/***************************************************************/ - -int tagger::taggerLeftSenseSpecialForUnknown() -{ - int cont=1; - while(sw->next()==0); - nodo *elem = sw->getIndex(); - if (sw->winExistUnkWord(2,d)==-1) - taggerModelRunning=&taggerModelList[1]; - else taggerModelRunning=&taggerModelList[0]; - - taggerGenerateScore(elem,2); - - while(sw->previous()==0) - { - elem = sw->getIndex(); - - if 
(sw->winExistUnkWord(2,d)==-1) - taggerModelRunning=&taggerModelList[1]; - else taggerModelRunning=&taggerModelList[0]; - - taggerGenerateScore(elem,2); - cont++; - } - - if (strcmp(flow,"LRL")==0) sw->winMaterializePOSValues(0); - return cont; -} - - -/***************************************************************/ - -int tagger::taggerRightSense() -{ - int cont=1; - - while(sw->previous()==0); - nodo *elem = sw->getIndex(); - taggerGenerateScore(elem,1); - - while(sw->next()==0) - { - elem = sw->getIndex(); - taggerGenerateScore(elem,1); - cont++; - } - - if (strcmp(flow,"LRL")==0) sw->winMaterializePOSValues(1); - - return cont; -} - - -/***************************************************************/ - -int tagger::taggerLeftSense() -{ - int cont=1; - while(sw->next()==0); - nodo *elem = sw->getIndex(); - taggerGenerateScore(elem,2); - - while(sw->previous()==0) - { - elem = sw->getIndex(); - taggerGenerateScore(elem,2); - cont++; - } - - if (strcmp(flow,"LRL")==0) sw->winMaterializePOSValues(0); - return cont; -} - - -/***************************************************************/ - -void tagger::taggerRun() -{ - int contWords=0,contSentences=0; - - struct tms tbuff1,tbuff2; - clock_t start,end; - start = times(&tbuff1); - - switch(taggerStrategy) - { - case STRA_1P_DEFAULT/*modstrat 0*/: taggerDoNormal(&contWords,&contSentences); break; - case STRA_2P_RELABELING/*modstrat 1*/: taggerDoNTimes(&contWords,&contSentences,taggerNumLaps); break; - case STRA_1P_UNSUPERVISED/*modstrat 2*/: taggerDoNormal(&contWords,&contSentences); break; - case STRA_1P_SENTENCE_LEVEL/*modstrat 3*/: /*taggerDoNTimes(&contWords,&contSentences,taggerNumLaps);*/ break; - case STRA_1P_ROBUST_UNK/*modstrat 4*/: taggerDoSpecialForUnknown(&contWords,&contSentences); break; - case STRA_1P_VERY_ROBUST_UNK/*modstrat 5*/: taggerDoNormal(&contWords,&contSentences); break; - case STRA_1P_ROBUST_SENTENCE_LEVEL: break; - } - end = times(&tbuff2); - - if (verbose_svmtool) - { - 
taggerShowVerbose(contSentences,1); - - fprintf(stderr,"* -------------------------------------------------------------------\n"); - showTime("Start Up Time", - ((double)(endStartUpTime-startUpTime))/CLOCKS_PER_SECOND, - ((double)tbuffEndStartUp.tms_utime-(double)tbuffStartUp.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuffEndStartUp.tms_stime-(double)tbuffStartUp.tms_stime)/CLOCKS_PER_SECOND); - fprintf(stderr,"* -------------------------------------------------------------------\n"); - showTime("Features Extraction Time",realFexTime,usrFexTime,sysFexTime); - showTime("SVM Time",realSVMTime,usrSVMTime,sysSVMTime); - showTime("Process Time",((double)(end-start))/CLOCKS_PER_SECOND - realFexTime - realSVMTime, - ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND - usrFexTime -usrSVMTime, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND - sysFexTime -sysSVMTime); - fprintf(stderr,"* -------------------------------------------------------------------\n"); - fprintf(stderr,"[ Tagging Time = Feature Extraction Time + SVM Time + Process Time ]\n"); - showTime("Tagging Time",((double)(end-start))/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); - fprintf(stderr,"* -------------------------------------------------------------------\n"); - fprintf(stderr,"[ Overall Time = Start up Time + Tagging Time ]\n"); - showTime("Overall Time",((double)(end-start+endStartUpTime-startUpTime))/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime+ - (double)tbuffEndStartUp.tms_utime-(double)tbuffStartUp.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime+ - (double)tbuffEndStartUp.tms_stime-(double)tbuffStartUp.tms_stime)/CLOCKS_PER_SECOND); - fprintf(stderr,"* -------------------------------------------------------------------\n"); - taggerStadistics(contWords,contSentences, - 
((double)(end-start))/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND, - ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND); - } -} - - -/***************************************************************/ - -void tagger::taggerDoNormal(int *numWords, int *numSentences) -{ - int contWordsLR=0,contWordsRL=0,contSentences=0,ret = 1; - - while ((ret>=0)) - { - if (verbose_svmtool) taggerShowVerbose(contSentences,0); - - if ((strcmp(flow,"LRL")==0) || (strcmp(flow,"LR")==0)) - contWordsLR = contWordsLR+taggerRightSense(); - if ((strcmp(flow,"LRL")==0) || (strcmp(flow,"RL")==0)) - contWordsRL = contWordsRL+taggerLeftSense(); - contSentences++; - sw->show(); - sw->deleteList(); - ret = sw->iniGeneric(); - } - if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; - else *numWords=contWordsRL/taggerNumLaps; - *numSentences = contSentences; -} - - -/***************************************************************/ - -void tagger::taggerDoSpecialForUnknown(int *numWords, int *numSentences) -{ - int contWordsLR=0,contWordsRL=0,contSentences=0,ret = 1; - - while ((ret>=0)) - { - if (verbose_svmtool) taggerShowVerbose(contSentences,0); - - if ((strcmp(flow,"LRL")==0) || (strcmp(flow,"LR")==0)) - contWordsLR = contWordsLR+taggerRightSenseSpecialForUnknown(); - if ((strcmp(flow,"LRL")==0) || (strcmp(flow,"RL")==0)) - contWordsRL = contWordsRL+taggerLeftSenseSpecialForUnknown(); - - contSentences++; - sw->show(); - sw->deleteList(); - ret = sw->iniGeneric(); - } - if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; - else *numWords=contWordsRL/taggerNumLaps; - *numSentences = contSentences; -} - - -/***************************************************************/ - -void tagger::taggerDoNTimes(int *numWords, int *numSentences,int laps) -{ - int contWordsLR=0,contWordsRL=0,contSentences=0,ret = 1; - - while ((ret>=0)) - { - - if (verbose_svmtool) taggerShowVerbose(contSentences,0); - - for (int pasadas=0;pasadas0) - 
sw->winMaterializePOSValues(2); - if ((strcmp(flow,"LRL")==0) || (strcmp(flow,"RL")==0)) - contWordsRL = contWordsRL+taggerLeftSense(); - - } - - contSentences++; - sw->show(); - sw->deleteList(); - ret = sw->iniGeneric(); - } - if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; - else *numWords=contWordsRL/taggerNumLaps; - *numSentences = contSentences; -} - - -/***************************************************************/ -/***************************************************************/ - -void tagger::taggerGenerateScore(nodo *elem,int direction) -{ - - struct tms tbuffStartFex,tbuffEndFex; - clock_t startFexTime,endFexTime; - struct tms tbuffStartSVM,tbuffEndSVM; - clock_t startSVMTime,endSVMTime; - - weight_node_t *weight; - nodo_feature_list *aux; - weightRepository *weightRep; - hash_t *bias; - int i,numMaybe,ret=1,max=0; - int is_unk=FALSE; - simpleList *featureList; - - startFexTime = times(&tbuffStartFex); - - i = d->getElement(elem->wrd); - if (i!=HASH_FAIL) - { - featureList = &taggerModelRunning->featureList; - numMaybe = d->getElementNumMaybe(i); - weight = taggerCreateWeightNodeArray(numMaybe,i); - if ((strcmp(flow,"LRL")==0) && (direction==2)) - { - //wr2; - weightRep = taggerModelRunning->wr2; - //bias = taggerModelRunning->bias2; //taggerBias2; - } - else - { - //wr; - weightRep = taggerModelRunning->wr; - //bias = taggerModelRunning->bias; //taggerBias; - } - } - else - { - numMaybe = NUM_UNK_POS; - weight = taggerInitializeWeightNodeArray(numMaybe,weightUnk); - featureList = &taggerModelRunning->featureListUnk; - is_unk = TRUE; - - if ((strcmp(flow,"LRL")==0) && (direction==2)) - { - //wrUnk2; - weightRep = taggerModelRunning->wrUnk2; - //bias = taggerModelRunning->biasUnk2; //taggerBiasUnk2; - } - else - { //wrUnk; - weightRep = taggerModelRunning->wrUnk; - //bias =taggerModelRunning->biasUnk; //taggerBiasUnk; - } - } - - if (numMaybe>1) - { - while (ret>=0) - { - aux = (nodo_feature_list *) featureList->getIndex(); - if 
(strcmp(aux->mark,SLASTW)==0) sw->winPushSwnFeature(stk); - else if (strcmp(aux->mark,WMARK)==0) sw->winPushWordFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,KMARK)==0) sw->winPushAmbiguityFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,MMARK)==0) sw->winPushMaybeFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,PMARK)==0) sw->winPushPosFeature((void *)aux,d,stk,direction); - else if (strcmp(aux->mark,MFTMARK)==0) sw->winPushMFTFeature((void *)aux,d,stk,direction); - else if (is_unk==TRUE) - { - int *param; - if (aux->n>0) - { - param = (int *) aux->l.getIndex(); - } - if (strcmp(aux->mark,PREFIX_MARK)==0) sw->winPushPrefixFeature(elem->wrd, stk, *param); - else if (strcmp(aux->mark,SUFFIX_MARK)==0) sw->winPushSuffixFeature(elem->wrd, stk, *param); - else if (strcmp(aux->mark,CHAR_A_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, COUNTING_FROM_BEGIN); - else if (strcmp(aux->mark,CHAR_Z_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, COUNTING_FROM_END); - else if (strcmp(aux->mark,LENGTH_MARK)==0) sw->winPushLenghtFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_CAPITAL_MARK)==0) sw->winPushStartWithCapFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_LOWER_MARK)==0) sw->winPushStartWithLowerFeature(elem->wrd,stk); - else if (strcmp(aux->mark,START_NUMBER_MARK)==0) sw->winPushStartWithNumberFeature(elem->wrd,stk); - else if (strcmp(aux->mark,ALL_UPPER_MARK)==0) sw->winPushAllUpFeature(elem->wrd,stk); - else if (strcmp(aux->mark,ALL_LOWER_MARK)==0) sw->winPushAllLowFeature(elem->wrd,stk); - else if (strcmp(aux->mark,CONTAIN_CAP_MARK)==0) sw->winPushContainCapFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_CAPS_MARK)==0) sw->winPushContainCapsFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_COMMA_MARK)==0) sw->winPushContainCommaFeature(elem->wrd, stk); - else if (strcmp(aux->mark,CONTAIN_NUMBER_MARK)==0) sw->winPushContainNumFeature(elem->wrd, stk); - 
else if (strcmp(aux->mark,CONTAIN_PERIOD_MARK)==0) sw->winPushContainPeriodFeature(elem->wrd, stk); - else if (strcmp(aux->mark,MULTIWORD_MARK)==0) sw->winPushMultiwordFeature(elem->wrd, stk); - } - ret = featureList->next(); - } - featureList->setFirst(); - - endFexTime = times(&tbuffEndFex); - realFexTime = realFexTime + ((double)(endFexTime-startFexTime))/CLOCKS_PER_SECOND; - usrFexTime = usrFexTime + (((double)tbuffEndFex.tms_utime-(double)tbuffStartFex.tms_utime)/CLOCKS_PER_SECOND); - sysFexTime = sysFexTime + (((double)tbuffEndFex.tms_stime-(double)tbuffStartFex.tms_stime)/CLOCKS_PER_SECOND); - - startSVMTime = times(&tbuffStartSVM); - - taggerSumWeight(weightRep,bias,weight,numMaybe,&max); - - endSVMTime = times(&tbuffEndSVM); - realSVMTime = realSVMTime + ((double)(endSVMTime-startSVMTime))/CLOCKS_PER_SECOND; - usrSVMTime = usrSVMTime + (((double)tbuffEndSVM.tms_utime-(double)tbuffStartSVM.tms_utime)/CLOCKS_PER_SECOND); - sysSVMTime = sysSVMTime + (((double)tbuffEndSVM.tms_stime-(double)tbuffStartSVM.tms_stime)/CLOCKS_PER_SECOND); - } - - strcpy(elem->pos,weight[max].pos); - elem->weight = weight[max].data; - - if (strcmp(flow,"LRL")==0) - { - weight_node_t *score = new weight_node_t; - score->data = weight[max].data; - strcpy(score->pos,weight[max].pos); - push(elem->stackScores,score); - } - - //mod delete[] instead of delete - if (i!=HASH_FAIL) delete[] weight; -} - - -/***************************************************************/ - -weight_node_t *tagger::taggerCreateWeightNodeArray(int numMaybe,int index) -{ - int ret=1,j = numMaybe; - weight_node_t *weight = new weight_node_t[numMaybe]; - simpleList *list = (simpleList *) d->getElementMaybe(index); - - while (ret>=0 && numMaybe > 0) - { - infoDict *pInfoDict = (infoDict *) list->getIndex(); - j--; - sprintf(weight[j].pos,"%s",pInfoDict->txt); - weight[j].data = 0; - ret=list->next(); - } - - list->setFirst(); - return weight; -} - - -/***************************************************************/ 
- -weight_node_t *tagger::taggerInitializeWeightNodeArray(int numMaybe,weight_node_t *w) -{ - for (int i=0;iwrGetWeight("BIASES",weight[j].pos); - weight[j].data = weight[j].data - b; - } - w = wRep->wrGetWeight(feature,weight[j].pos); - weight[j].data=weight[j].data+w; - if (((float)weight[*max].data)<((float)weight[j].data)) *max=j; - } - delete[] feature; //mod delete[] instead of delete - putBias=0; - } -} - - -/***************************************************************/ -/***************************************************************/ - -weight_node_t *tagger::taggerCreateWeightUnkArray(char *name) -{ - NUM_UNK_POS=0; - int i=0; - char c=' '; - FILE *f; - - if ((f = fopen(name, "rt"))== NULL) - { - fprintf(stderr, "Error opening file: %s",name); - exit(0); - } - - while (!feof(f)) - { - if (fgetc(f)=='\n') NUM_UNK_POS++; - } - - fseek(f,0,SEEK_SET); - - weight_node_t *weight = new weight_node_t[NUM_UNK_POS]; - while (!feof(f) && (i +#include +#include + + +/***************************************************************/ + +struct tms tbuffStartUp,tbuffEndStartUp; +clock_t startUpTime,endStartUpTime; +double sysFexTime=0, usrFexTime=0,realFexTime=0; +double sysSVMTime=0, usrSVMTime=0,realSVMTime=0; + +/***************************************************************/ + +extern int verbose; + + +/***************************************************************/ + +hash_t *tagger::taggerCreateBiasHash(const std::string& name) +{ + hash_t *bias = new hash_t; + int i=0; + char c=' '; + weight_node_t *w; + FILE *f; + + bias->hash_init(40); + + if ((f = fopen(name.c_str(), "rt"))== NULL) + { + std::cerr << "Error opening file: "<pos = ""; + while ((c!='\n') && (!feof(f))) + { + if (c!=' ' && c!='\n' && c!='\t' && i==1) weight.push_back(c); + else if (c!=' ' && c!='\n' && c!='\t' && i==0) + { + if (c!=':') w->pos += c; + else i=1; + } + c = fgetc(f); + } + w->data = (long double)0; + std::istringstream iss(weight); + iss >> w->data; + bias->hash_insert(w->pos,w); 
+ } //end if + else while((c=fgetc(f))!='\n'); + } + fclose(f); + return bias; +} + +/***************************************************************/ + +tagger::tagger(const std::string& model) : stk(), sw(0) +{ + std::string name; + flow = "LR"; + taggerShowScoresFlag = false; + taggerShowCommentsFlag = true; + taggerNumLaps = 1; + taggerKFilter = 0; + taggerUFilter = 0; + taggerStrategy = STRA_1P_DEFAULT; //modstrat //0; + taggerWinIndex = -1; + taggerWinLength = -1; + taggerModelName = model; + taggerBackupDict =""; +} + +/***************************************************************/ + +void tagger::taggerLoadModels(models_t *model, int taggerNumModel) +{ + std::string flow2,flow1; + std::ostringstream name; + + //Cargamos la lista de "features" para palabras conocidas + name << taggerModelName << ".A" << taggerNumModel; + if (verbose) std::cerr << std::endl << "Loading FEATURES FOR KNOWN WORDS from < "<"<< std::endl; + createFeatureList(name.str(),&model->featureList); + //Cargamos la lista de "features" para palabras desconocidas + name.str(""); + name << taggerModelName << ".A" << taggerNumModel << ".UNK"; + if (verbose) std::cerr << std::endl << "Loading FEATURES FOR UNKNOWN WORDS from < "<"<< std::endl; + createFeatureList(name.str(),&model->featureListUnk); + + if (flow == "LRL") + { + flow1 = "LR"; flow2 = "RL"; + + name.str(""); + name << flow1 << " (Right-to-Left)"; + if (verbose) std::cerr << std::endl <<"READING MODELS < direction = "<"<"<< std::endl; + model->wr2 = new weightRepository(name.str(),taggerKFilter); + + name.str(""); + name << taggerModelName << ".UNK.M"<"<wrUnk2 = new weightRepository(name.str(),taggerUFilter); + } + else flow1 =flow; + + name.str(""); + if (flow1 =="RL") name <"<"<wr = new weightRepository(name.str(),taggerKFilter); + + name.str(""); + name << taggerModelName << ".UNK.M"<"<wrUnk = new weightRepository(name.str(),taggerUFilter); + +} + +/***************************************************************/ + +void 
tagger::taggerLoadModelsForTagging() +{ + startUpTime = times(&tbuffStartUp); + + int modelsNeeded=1; + + std::string name = taggerModelName + ".DICT"; + if (!taggerBackupDict.empty()) + { + if (verbose) std::cerr << "Loading DICTIONARY from < "< with BACKUP DICTIONARY from < "<"<< std::endl; + d = new dictionary(name,taggerBackupDict); + } + else + { + if (verbose) std::cerr<<"Loading DICTIONARY from < "<"<"<taggerShowScoresFlag = false; +} + +/***************************************************************/ + +void tagger::taggerPutFlow(const std::string& inFlow) +{ + flow = inFlow; +} + +/***************************************************************/ + +void tagger::taggerPutStrategy(int num) +{ + taggerStrategy = num; +} + +/***************************************************************/ + +void tagger::taggerPutWinLength(int l) +{ + taggerWinLength = l; +} + +/***************************************************************/ + +void tagger::taggerPutWinIndex(int i) +{ + taggerWinIndex = i; +} + +/***************************************************************/ + +void tagger::taggerPutBackupDictionary(const std::string& dictName) +{ + taggerBackupDict = dictName; +} + +/***************************************************************/ + +void tagger::taggerPutKWeightFilter(float kfilter) +{ + taggerKFilter = kfilter; +} + +/***************************************************************/ + +void tagger::taggerPutUWeightFilter(float ufilter) +{ + taggerUFilter = ufilter; +} + +/***************************************************************/ +/***************************************************************/ +/***************************************************************/ +/***************************************************************/ + +int tagger::taggerRightSenseSpecialForUnknown() +{ + int cont=1; + + while(sw->previous()); + nodo *elem = sw->getIndex(); + + if (sw->winExistUnkWord(1,d)==-1) + taggerModelRunning=&taggerModelList[1]; + else 
taggerModelRunning=&taggerModelList[0]; + + taggerGenerateScore(elem,1); + + while(sw->next()) + { + elem = sw->getIndex(); + + if (sw->winExistUnkWord(1,d)==-1) + taggerModelRunning=&taggerModelList[1]; + else taggerModelRunning=&taggerModelList[0]; + + taggerGenerateScore(elem,1); + cont++; + } + + if (flow == "LRL") sw->winMaterializePOSValues(1); + + return cont; +} + +/***************************************************************/ + +int tagger::taggerLeftSenseSpecialForUnknown() +{ + int cont=1; + while(sw->next()); + nodo *elem = sw->getIndex(); + if (sw->winExistUnkWord(2,d)==-1) + taggerModelRunning=&taggerModelList[1]; + else taggerModelRunning=&taggerModelList[0]; + + taggerGenerateScore(elem,2); + + while(sw->previous()) + { + elem = sw->getIndex(); + + if (sw->winExistUnkWord(2,d)==-1) + taggerModelRunning=&taggerModelList[1]; + else taggerModelRunning=&taggerModelList[0]; + + taggerGenerateScore(elem,2); + cont++; + } + + if (flow=="LRL") sw->winMaterializePOSValues(0); + return cont; +} + +/***************************************************************/ + +int tagger::taggerRightSense() +{ + int cont=1; + + while(sw->previous()); + nodo *elem = sw->getIndex(); + if (elem == 0) + { + std::cerr << "tagger::taggerRightSense: ERROR index null at beginning" << std::endl; + return -1; + } + taggerGenerateScore(elem,1); + + while(sw->next()) + { + elem = sw->getIndex(); + taggerGenerateScore(elem,1); + cont++; + } + + if (flow =="LRL") sw->winMaterializePOSValues(1); + + return cont; +} + +/***************************************************************/ + +int tagger::taggerLeftSense() +{ + int cont=1; + while(sw->next()); + nodo *elem = sw->getIndex(); + taggerGenerateScore(elem,2); + + while(sw->previous()) + { + elem = sw->getIndex(); + taggerGenerateScore(elem,2); + cont++; + } + + if (flow =="LRL") sw->winMaterializePOSValues(0); + return cont; +} + +/***************************************************************/ + +void tagger::taggerRun() +{ + 
int contWords=0,contSentences=0; + + struct tms tbuff1,tbuff2; + clock_t start,end; + start = times(&tbuff1); + + switch(taggerStrategy) + { + case STRA_1P_DEFAULT/*modstrat 0*/: taggerDoNormal(&contWords,&contSentences); break; + case STRA_2P_RELABELING/*modstrat 1*/: taggerDoNTimes(&contWords,&contSentences,taggerNumLaps); break; + case STRA_1P_UNSUPERVISED/*modstrat 2*/: taggerDoNormal(&contWords,&contSentences); break; + case STRA_1P_SENTENCE_LEVEL/*modstrat 3*/: /*taggerDoNTimes(&contWords,&contSentences,taggerNumLaps);*/ break; + case STRA_1P_ROBUST_UNK/*modstrat 4*/: taggerDoSpecialForUnknown(&contWords,&contSentences); break; + case STRA_1P_VERY_ROBUST_UNK/*modstrat 5*/: taggerDoNormal(&contWords,&contSentences); break; + case STRA_1P_ROBUST_SENTENCE_LEVEL: break; + } + end = times(&tbuff2); + + + if (verbose) + { taggerShowVerbose(contSentences,1); + + std::cerr<<"* -------------------------------------------------------------------"<=0)) + { + if (verbose) taggerShowVerbose(contSentences,0); + + if ((flow=="LRL") || (flow =="LR")) + contWordsLR = contWordsLR+taggerRightSense(); + if ((flow =="LRL") || (flow == "RL")) + contWordsRL = contWordsRL+taggerLeftSense(); + contSentences++; + sw->show(taggerShowScoresFlag, taggerShowCommentsFlag); + sw->deleteList(); + ret = sw->iniGeneric(d); + } + if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; + else *numWords=contWordsRL/taggerNumLaps; + *numSentences = contSentences; +} + +/***************************************************************/ + +void tagger::taggerDoSpecialForUnknown(int *numWords, int *numSentences) +{ + int contWordsLR=0,contWordsRL=0,contSentences=0,ret = 1; + + while ((ret>=0)) + { + if (verbose) taggerShowVerbose(contSentences,0); + + if ((flow == "LRL") || (flow =="LR")) + contWordsLR = contWordsLR+taggerRightSenseSpecialForUnknown(); + if ((flow == "LRL") || (flow =="RL")) + contWordsRL = contWordsRL+taggerLeftSenseSpecialForUnknown(); + + contSentences++; + 
sw->show(taggerShowScoresFlag, taggerShowCommentsFlag); + sw->deleteList(); + ret = sw->iniGeneric(d); + } + if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; + else *numWords=contWordsRL/taggerNumLaps; + *numSentences = contSentences; +} + +/***************************************************************/ + +void tagger::taggerDoNTimes(int *numWords, int *numSentences,int laps) +{ + int contWordsLR=0,contWordsRL=0,contSentences=0,ret = 1; + + while ((ret>=0)) + { + if (verbose) taggerShowVerbose(contSentences,0); + + for (int pasadas=0;pasadas0) + sw->winMaterializePOSValues(2); + if ((flow == "LRL") || (flow =="RL")) + contWordsRL = contWordsRL+taggerLeftSense(); + } + + contSentences++; + sw->show(taggerShowScoresFlag, taggerShowCommentsFlag); + sw->deleteList(); + ret = sw->iniGeneric(d); + } + if (contWordsRL==0) *numWords=contWordsLR/taggerNumLaps; + else *numWords=contWordsRL/taggerNumLaps; + *numSentences = contSentences; +} + +/***************************************************************/ + +void tagger::taggerGenerateScore(nodo *elem,int direction) +{ + + struct tms tbuffStartFex,tbuffEndFex; + clock_t startFexTime,endFexTime; + struct tms tbuffStartSVM,tbuffEndSVM; + clock_t startSVMTime,endSVMTime; + + weight_node_t *weight; + weightRepository *weightRep; + int numMaybe,max=0; + int is_unk=FALSE; + simpleList *featureList; + + startFexTime = times(&tbuffStartFex); + + dataDict* i = d->getElement(elem->wrd); + if ((long)i!=HASH_FAIL) + { + featureList = &taggerModelRunning->featureList; + numMaybe = d->getElementNumMaybe(i); + weight = taggerCreateWeightNodeArray(numMaybe,i); + if ((flow =="LRL") && (direction==2)) + { + weightRep = taggerModelRunning->wr2; //wr2; + //bias = taggerModelRunning->bias2; //taggerBias2; + } + else + { + weightRep = taggerModelRunning->wr; //wr; + //bias = taggerModelRunning->bias; //taggerBias; + } + } + else + { + featureList = &taggerModelRunning->featureListUnk; + weight = taggerCreateWeightUnkArray(&numMaybe); 
+ is_unk = TRUE; + + if (flow =="LRL" && (direction==2)) + { + weightRep = taggerModelRunning->wrUnk2; //wrUnk2; + //bias = taggerModelRunning->biasUnk2; //taggerBiasUnk2; + } + else + { weightRep = taggerModelRunning->wrUnk; //wrUnk; + //bias =taggerModelRunning->biasUnk; //taggerBiasUnk; + } + } + + + if (numMaybe>1) + { + bool ret = true; + while (ret) + { + nodo_feature_list* aux= *featureList->getIndex(); + if (aux->mark == SLASTW) sw->winPushSwnFeature(stk); + else if (aux->mark == WMARK) sw->winPushWordFeature((void *)aux,d,stk,direction); + else if (aux->mark == KMARK) sw->winPushAmbiguityFeature((void *)aux,d,stk,direction); + else if (aux->mark == MMARK) sw->winPushMaybeFeature((void *)aux,d,stk,direction); + else if (aux->mark == PMARK) sw->winPushPosFeature((void *)aux,d,stk,direction); + else if (aux->mark == MFTMARK) sw->winPushMFTFeature((void *)aux,d,stk,direction); + else if (is_unk==TRUE) + { + int *param; + if (!aux->l.isEmpty()) + { + param = *aux->l.getIndex(); + } + if (aux->mark == PREFIX_MARK) sw->winPushPrefixFeature(elem->wrd, stk, *param); + else if (aux->mark == SUFFIX_MARK) sw->winPushSuffixFeature(elem->wrd, stk, *param); + else if (aux->mark == CHAR_A_MARK) sw->winPushLetterFeature(elem->wrd, stk, COUNTING_FROM_BEGIN, *param); + else if (aux->mark == CHAR_Z_MARK) sw->winPushLetterFeature(elem->wrd, stk, COUNTING_FROM_END, *param); + else if (aux->mark == LENGTH_MARK) sw->winPushLenghtFeature(elem->wrd,stk); + else if (aux->mark == START_CAPITAL_MARK) sw->winPushStartWithCapFeature(elem->wrd,stk); + else if (aux->mark == START_LOWER_MARK) sw->winPushStartWithLowerFeature(elem->wrd,stk); + else if (aux->mark == START_NUMBER_MARK) sw->winPushStartWithNumberFeature(elem->wrd,stk); + else if (aux->mark == ALL_UPPER_MARK) sw->winPushAllUpFeature(elem->wrd,stk); + else if (aux->mark == ALL_LOWER_MARK) sw->winPushAllLowFeature(elem->wrd,stk); + else if (aux->mark == CONTAIN_CAP_MARK) sw->winPushContainCapFeature(elem->wrd, stk); + else if 
(aux->mark == CONTAIN_CAPS_MARK) sw->winPushContainCapsFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_COMMA_MARK) sw->winPushContainCommaFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_NUMBER_MARK) sw->winPushContainNumFeature(elem->wrd, stk); + else if (aux->mark == CONTAIN_PERIOD_MARK) sw->winPushContainPeriodFeature(elem->wrd, stk); + else if (aux->mark == MULTIWORD_MARK) sw->winPushMultiwordFeature(elem->wrd, stk); + } + ret = featureList->next(); + } + featureList->setFirst(); + + endFexTime = times(&tbuffEndFex); + realFexTime = realFexTime + ((double)(endFexTime-startFexTime))/CLOCKS_PER_SECOND; + usrFexTime = usrFexTime + (((double)tbuffEndFex.tms_utime-(double)tbuffStartFex.tms_utime)/CLOCKS_PER_SECOND); + sysFexTime = sysFexTime + (((double)tbuffEndFex.tms_stime-(double)tbuffStartFex.tms_stime)/CLOCKS_PER_SECOND); + + startSVMTime = times(&tbuffStartSVM); + + elem->strScores = taggerSumWeight(weightRep,weight,numMaybe,&max); + //std::cerr << "tagger::taggerGenerateScore got elem strScores: '" << elem->strScores << "'" << std::endl; + + endSVMTime = times(&tbuffEndSVM); + realSVMTime = realSVMTime + ((double)(endSVMTime-startSVMTime))/CLOCKS_PER_SECOND; + usrSVMTime = usrSVMTime + (((double)tbuffEndSVM.tms_utime-(double)tbuffStartSVM.tms_utime)/CLOCKS_PER_SECOND); + sysSVMTime = sysSVMTime + (((double)tbuffEndSVM.tms_stime-(double)tbuffStartSVM.tms_stime)/CLOCKS_PER_SECOND); + } + + elem->pos = weight[max].pos; + elem->weight = weight[max].data; + + if (flow =="LRL") + { + weight_node_t* score = new weight_node_t(); + score->data = weight[max].data; + score->pos = weight[max].pos; + elem->stackScores.push(score); + } + + delete[] weight; +} + + + +/***************************************************************/ + +/* Returns an array ready to be filled with maybe informations */ +weight_node_t *tagger::taggerCreateWeightNodeArray(int numMaybe,dataDict* index) +{ + int j = numMaybe; + weight_node_t *weight = new weight_node_t[numMaybe]; 
+ simpleList *list = &d->getElementMaybe(index); + + bool ret=true; + while (ret && numMaybe > 0) + { + infoDict *pInfoDict = *list->getIndex(); + j--; + weight[j].pos = pInfoDict->pos; + weight[j].data = 0; + ret=list->next(); + } + + list->setFirst(); + return weight; +} + +/***************************************************************/ + +std::string tagger::taggerSumWeight(weightRepository* wRep, weight_node_t* weight, int numMaybe, int* max) +{ +// weight_node_t *aux; + long double w,b = 0; + std::string feature; + int putBias=1; + + while (!stk.empty()) + { + *max=0; + feature = stk.top(); + stk.pop(); + for (int j=0; jwrGetWeight("BIASES",weight[j].pos); + weight[j].data = weight[j].data - b; + } + w = wRep->wrGetWeight(feature,weight[j].pos); + weight[j].data=weight[j].data+w; + if (((float)weight[*max].data)<((float)weight[j].data)) *max=j; + } + putBias=0; + } + std::ostringstream tmp; + if ( true ) + { + for (int i=0; i -#include -#include -#include "hash.h" -#include "weight.h" - -float absolut(float f) -{ - if (f < 0) return (-1)*f; - else return f; -} - - -/***********************************************************/ - -/* - * El objeto WeightRepository es el encargado de contener los pesos - * para cada pareja POS-feature. Un depsito de pesos est formado por - * un hash de objetos weight_struct_t, conteniendo los atributos (key). - * Cada uno de estos hash es poseedor de un nuevo hash con todas las POS - * para las cuales se ha encontrado el atributo y su respectivo peso. - * (weight_node_t). 
- */ - -/***********************************************************/ - -//Definicin de ewight_struct_t -class weight_struct_t -{ - public: - char key[150]; - hash_t *hash; - - //El destructot de este objeto eliminar el contenido del hash - ~weight_struct_t() - { - weight_node_t *aux; - - hash_t *tptr = this->hash; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - //Recorremos todas las entradas de la tabla de hash - //Eliminando todos no objetos que encontremos - for (int i=0; inext; - - aux = (weight_node_t *) tmp->data; - - delete aux; - aux = NULL; - } /* while */ - } /* for */ - - hash_destroy(hash); - } -}; - -/***********************************************************/ - -char weightRepository::wrSaltarBlancs(FILE *in, char c,int jmp) -{ - while ((c==':') || (c==' ') || (c=='\n' && jmp==1)) c=fgetc(in); - return c; -} - - -/***********************************************************/ - -/* - * void weightRepository::wrReadMergeModel(FILE *in,float filter) - * Parmetros: - * FILE *in : apuntador al fichero que ha de leer - * float filter: Valor para filtrar los pesos que se lean - * Este mtodo carga un depsito de pesos de un fichero (f), filltrando - * los pesos que esten por debajo del lmite marcado (filter) - */ -void weightRepository::wrReadMergeModel(FILE *in,float filter) -{ - char c=fgetc(in),key[200],value[100],*endptr; - weight_struct_t *obj; - char garbage[512]; - - strcpy(key,""); - strcpy(value,""); - while (!feof(in)) - { - //c = fgetc(in); - if (c!='#') - { - obj = new weight_struct_t; - strcpy(obj->key,""); - - while (c!=' ') - { - sprintf(obj->key,"%s%c",obj->key,c); - c=fgetc(in); - } - - obj->hash = new hash_t; - hash_init(obj->hash,10); - - while ((c!='\n') && (!feof(in))) - { - weight_node_t *w = new weight_node_t; - - c = wrSaltarBlancs(in,c,0); - strcpy(w->pos,""); strcpy(value,""); - while ((c!=':') && (!feof(in))) - { - sprintf(w->pos,"%s%c",w->pos,c); - c=fgetc(in); - 
} - - c = wrSaltarBlancs(in,c,0); - - while ((c!=' ') && (c!='\n') && (!feof(in)) ) - { - sprintf(value,"%s%c",value,c); - c=fgetc(in); - } - - w->data=atof(value); - if ( absolut(w->data) > absolut(filter) ) - hash_insert(obj->hash,w->pos,(uintptr_t) w); - else delete w; - } - - c = wrSaltarBlancs(in,c,1); - - hash_insert(&wr,obj->key, (uintptr_t) obj); - } - else - { //while(c=fgetc(in)!='\n'); - fgets(garbage,512,in); - c = fgetc(in); - } - } -} - - -/***********************************************************/ - -/* - * long double weightRepository::wrGetWeight(const char *feature,char *pos) - * Parmetros: - * char *feature: Atributo - * char *pos: Etiqueta morfosintctica - * Lee el peso para el atributo y la etiqueta recibidos como parmetro. - */ -long double weightRepository::wrGetWeight(const char *feature,char *pos) -{ - uintptr_t h = hash_lookup(&wr,feature); - if (h!=HASH_FAIL) - { - weight_struct_t *obj = (weight_struct_t *)h; - uintptr_t w = hash_lookup(obj->hash,pos); - - if (w!=HASH_FAIL) - { - weight_node_t *ret = (weight_node_t *)w; - return ret->data; - } - } - return 0; -} - - -/***********************************************************/ - -/* - * weightRepository(char *fileName,float filter) - * Parmetros: - * char *fileName : Nombre del fichero - * float filter: Valor para filtrar los pesos que se lean - * Constructor que carga el deposito de pesos del fichero llamado - * fileName , filltrando los pesos que esten por debajo del lmite - * marcado (filter) - */ -weightRepository::weightRepository(char *fileName,float filter) -{ - FILE *in; - if ((in = fopen(fileName, "rt"))== NULL) - { - fprintf(stderr, "Error opening weightRepository: %s. 
It's going to work without it.\n",fileName); - exit(0); - } - hash_init(&wr,10000); - wrReadMergeModel(in,filter); - fclose(in); -} - - -/***********************************************************/ - -/* - * weightRepository() - * Contructor - */ -weightRepository::weightRepository() -{ - hash_init(&wr,10000); -} - - -/***********************************************************/ - -/* - * ~weightRepository() - * Destructor - */ -weightRepository::~weightRepository() -{ - - weight_struct_t *aux; - - hash_t *tptr = ≀ - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size; - - old_bucket=tptr->bucket; - old_size=tptr->size; - //Recorre las listas de sinnimos de la tabla de hash - //eliminando los datos - for (int i=0; inext; - - aux = (weight_struct_t *) tmp->data; - - delete aux; - aux = NULL; - } /* while */ - } /* for */ - - hash_destroy(&wr); -} - - -/*******************************************************/ - -/* - * void wrAddPOS(uintptr_t obj, char* pos, long double weight) - * Parmetros: - * int obj: Apuntador al objeto que contiene el atributo - * char *pos: Etiqueta a insertar: - * long double weight: Peso a asignar a la etiqueta - * Insertamos un nuevo peso para la etiqueta pos, en el atributo indicado - * por obj. Si la etiqueta ya existe se incrementa el peso con weight. Si - * no existe se aade. 
- */ -void weightRepository::wrAddPOS(uintptr_t obj, char* pos, long double weight) -{ - weight_struct_t *wst = (weight_struct_t *)obj; - uintptr_t x = hash_lookup( wst->hash, pos); - - if (x==HASH_FAIL) - { - //Insertamos Nueva POS - weight_node_t *w = new weight_node_t; - strcpy(w->pos,pos); - w->data=weight; - hash_insert( wst->hash,w->pos,(uintptr_t) w); - } - else - { //Si POS ya esta, incrementamos el peso - weight_node_t *wnt = (weight_node_t *)x; - wnt->data = wnt->data + weight; - } -} - - -/*******************************************************/ - -/* - * void wrAdd(char *feature, char* pos, long double weight) - * Parmetros: - * char *feature: Atributo a insertar - * char *pos: Etiqueta a insertar - * long double weight: Peso a asignar a la etiqueta - * Insertamos un nuevo peso para para el atributo feature y la etiqueta pos. - */ -void weightRepository::wrAdd(char *feature, char* pos, long double weight) -{ - weight_struct_t *obj = (weight_struct_t *)hash_lookup(&wr,feature); - - if ( (uintptr_t) obj == HASH_FAIL) - { - // Creamos nueva entrada en WeightRepository - obj = new weight_struct_t; - strcpy(obj->key,feature); - obj->hash = new hash_t; - hash_init(obj->hash,10); - //Aadimos el peso y la etiqueta - wrAddPOS((uintptr_t)obj,pos,weight); - hash_insert(&wr,obj->key, (uintptr_t) obj); - } - else - //Aadimos el peso y la etiqueta - wrAddPOS((uintptr_t)obj,pos,weight); -} - - -/*******************************************************/ - -/* - * wrWrite(const char *outName) - * Escribe el depsito de pesos en el fichero con nombre outName. 
- * - * Modificacin 180705: - * Aadimos el parmetro "float filter", se utiliza para filtrar pesos - */ -void weightRepository::wrWrite(const char *outName, float filter) -{ - weight_struct_t *wst; - FILE *f; - - if ((f = fopen(outName, "w"))== NULL) - { - fprintf(stderr, "Error opening file: %s\n",outName); - exit(0); - } - - hash_t *tptr = ≀ - - hash_node_t *node, *last; - int i; - - //Recorremos el hash objeto a objeto - for (i=0; isize; i++) - { - node = tptr->bucket[i]; - while (node != NULL) - { - last = node; - node = node->next; - wst = (weight_struct_t *) last->data; - - //Modificacin 180705: aadimos filtrado de pesos - //char *mrg = wrGetMergeInput(wst->hash); //DEL 180705 - //ADD 180705 - char *mrg = wrGetMergeInput(wst->hash,filter); - - if (strcmp(mrg,"")!=0) fprintf(f,"%s%s\n",wst->key,mrg); - delete mrg; - } //while - } //for - fclose (f); -} - - -/*******************************************************/ -/* - * char *wrGetMergeInput(hash_t *tptr) - * Devuelve una cadena de caracteres con todas las parejas - * POS/PESO contenidas en el hash (tptr) de un atributo. - * - * Modificacin 180705: - * Aadimos parmetro "float filter" para filtrado de pesos - */ -char *weightRepository::wrGetMergeInput(hash_t *tptr, float filter) -{ - char *out = new char[3000]; - weight_node_t *wnt; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - - old_bucket=tptr->bucket; - old_size=tptr->size; - strcpy (out,""); - - for (i=0; inext; - wnt = (weight_node_t *) tmp->data; - - if ((float)wnt->data!=0) - { - //Modificacin 180705: Filtrado de pesos - //Comprobamos que el peso a insertar en el fichero - //cumple con el filtrado de pesos. 
- //ADD 180705 - if ( absolut(wnt->data) > absolut(filter) ) - sprintf(out,"%s %s:%.18E",out,wnt->pos, (float) wnt->data); - } - } //while - } //for - return out; -} - - -/*******************************************************/ - -/* - * void wrWriteHash(hash_t *tptr,FILE *f, char separador) - * Escribe el contenido de un hash (tptr), en fichero apuntado por f. - * Entre cada pareja POS/PESO pone el caracter separador. - */ -void weightRepository::wrWriteHash(hash_t *tptr,FILE *f, char separador) -{ - weight_node_t *wnt; - hash_node_t **old_bucket, *old_hash, *tmp; - int old_size, h, i; - int cont=0; - - old_bucket=tptr->bucket; - old_size=tptr->size; - - for (i=0; inext; - wnt = (weight_node_t *) tmp->data; - if (separador == '\n' && cont==0) fprintf(f,"%s %2.10f",wnt->pos,(float)wnt->data); - else fprintf(f,"%c%s:%2.10f",separador,wnt->pos,(float)wnt->data); - cont++; - } /* while */ - } /* for */ -} +// kate: replace-tabs on; indent-width 2; indent-mode cstyle; encoding latin15; +/* + * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "hash.h" +#include "weight.h" + + +/***********************************************************/ + +/* + * El objeto WeightRepository es el encargado de contener los pesos + * para cada pareja POS-feature. Un depósito de pesos está formado por + * un hash de objetos weight_struct_t, conteniendo los atributos (key). + * Cada uno de estos hash es poseedor de un nuevo hash con todas las POS + * para las cuales se ha encontrado el atributo y su respectivo peso. + * (weight_node_t). +*/ + +/***********************************************************/ + +//Definición de weight_struct_t +class weight_struct_t +{ +public: + std::string key; + hash_t *hash; + + // free inner memory and then free the hash. 
+  ~weight_struct_t()
+  {
+    hash->hash_destroy();
+  }
+};
+
+
+
+/***********************************************************/
+
+/*
+ * char wrSaltarBlancs(FILE *in, char c, int jmp)
+ * Parameters:
+ *      FILE *in: stream to keep reading from
+ *      char c:   character most recently read from the stream
+ *      int jmp:  when 1, '\n' is skipped as well
+ * Skips separators: while c is ':' or ' ' (or '\n' when jmp is 1) the
+ * next character is read from in. Returns the first character that is
+ * not skipped.
+ */
+char weightRepository::wrSaltarBlancs(FILE *in, char c,int jmp)
+{
+  while ((c==':') || (c==' ') || (c=='\n' && jmp==1)) c=fgetc(in);
+  return c;
+}
+
+/***********************************************************/
+
+/*
+ * void weightRepository::wrReadMergeModel(FILE *in,float filter)
+ * Parameters:
+ *      FILE *in:     stream to read the merged model from
+ *      float filter: threshold; a weight is kept only when
+ *                    fabsf(weight) > fabsf(filter)
+ * Loads a weight repository from in. A line starting with '#' is
+ * skipped. Any other line is read as a key, a blank, and then a list
+ * of "pos:value" pairs up to the end of the line. One weight_struct_t
+ * is created per line and inserted into wr under its key; each pair
+ * that passes the filter is inserted into that entry's own hash.
+ */
+void weightRepository::wrReadMergeModel(FILE *in,float filter)
+{
+  char c=fgetc(in);
+
+  while (!feof(in))
+  {
+    if (c!='#')
+    {
+      weight_struct_t *obj = new weight_struct_t;
+      obj->key = "";
+
+      // The key runs up to the first blank. The end-of-file test matters:
+      // once the stream is exhausted fgetc() keeps returning EOF, so a
+      // truncated file would otherwise append to the key forever.
+      while ((c!=' ') && (!feof(in)))
+      {
+        obj->key.push_back(c);
+        c=fgetc(in);
+      }
+
+      obj->hash = new hash_t();
+      obj->hash->hash_init(10);
+
+      // Read the "pos:value" pairs of this line.
+      while ((c!='\n') && (!feof(in)))
+      {
+        std::string pos;
+
+        c = wrSaltarBlancs(in,c,0);
+        while ((c!=':') && (!feof(in)))
+        {
+          pos.push_back(c);
+          c=fgetc(in);
+        }
+
+        c = wrSaltarBlancs(in,c,0);
+
+        std::string value;
+        while ((c!=' ') && (c!='\n') && (!feof(in)) )
+        {
+          value.push_back(c);
+          c=fgetc(in);
+        }
+
+        // Start from 0 so that a value that cannot be parsed is never
+        // read uninitialised; it then fails the filter test below.
+        long double data = 0;
+        std::istringstream iss(value);
+        iss >> data;
+        weight_node_t* node = new weight_node_t();
+        node->data = data;
+        node->pos = pos;
+        if ( fabsf(data) > fabsf(filter) )
+          obj->hash->hash_insert(pos,node);
+        else delete node;
+      }
+
+      // Move past the end of the line (jmp==1 also skips '\n').
+      c = wrSaltarBlancs(in,c,1);
+
+      wr.hash_insert(obj->key, obj);
+    }
+    else
+    {
+      // Comment line: discard it (at most 511 characters per fgets call)
+      // and fetch the first character of what follows.
+      char garbage[512];
+      fgets(garbage,512,in);
+      c = fgetc(in);
+    }
+  }
+}
+
+/***********************************************************/
+
+/*
+ * long double weightRepository::wrGetWeight(const std::string& feature,const std::string& pos)
+ * Parameters:
+ *      feature: attribute to look up in the repository (wr)
+ *      pos:     part-of-speech tag to look up inside that attribute
+ * Returns the weight stored for the (feature, pos) pair, or 0 when
+ * either lookup does not find an entry.
+ */
+long double weightRepository::wrGetWeight(const std::string& feature,const std::string& pos)
+{
+  weight_struct_t *obj = wr.hash_lookup(feature);
+  // A miss is accepted both as HASH_FAIL and as a null pointer, the same
+  // two tests already applied to the inner lookup below.
+  if ((long)obj!=HASH_FAIL && obj != 0)
+  {
+    weight_node_t *ret = obj->hash->hash_lookup(pos);
+
+    if ((long)ret!=HASH_FAIL && ret != 0)
+    {
+      return ret->data;
+    }
+  }
+  return 0;
+}
+
+/***********************************************************/
+
+/*
+ * weightRepository(std::string fileName,float filter)
+ * Parámetros:
+ * std::string fileName : Nombre del fichero
+ * float filter: Valor para filtrar los pesos que se lean
+ * Constructor que carga el depóosito de pesos del fichero llamado
+ * fileName , filltrando los pesos que esten por debajo del límite
+ * marcado (filter)
+ */
+weightRepository::weightRepository(const std::string& fileName,float filter)
+{
+// std::cerr << "weightRepository::weightRepository " << fileName << std::endl;
+  FILE *in;
+  if ((in = fopen(fileName.c_str(), "rt"))== NULL)
+  {
+    fprintf(stderr, "Error opening weightRepository: %s. 
It's going to work without it.\n",fileName.c_str());
+    exit(0);
+  }
+  wr.hash_init(10000);
+  wrReadMergeModel(in,filter);
+  fclose(in);
+}
+
+/***********************************************************/
+
+/*
+ * weightRepository()
+ * Default constructor: initialises the repository hash (wr) with
+ * hash_init(10000).
+ */
+weightRepository::weightRepository()
+{
+  wr.hash_init(10000);
+}
+
+/***********************************************************/
+
+/*
+ * ~weightRepository()
+ * Destructor: calls hash_destroy() on the repository hash (wr).
+ */
+weightRepository::~weightRepository()
+{
+  wr.hash_destroy();
+}
+
+/*******************************************************/
+
+/*
+ * void wrAddPOS(long unsigned int obj, const std::string& pos, long double weight)
+ * Parameters:
+ *      obj:    address of the weight_struct_t that holds the attribute,
+ *              passed as an integer
+ *      pos:    tag to insert
+ *      weight: weight to assign to the tag
+ * Adds weight for the tag pos inside the attribute referred to by obj.
+ * If the tag is already present its weight is incremented by weight,
+ * otherwise a new node is created and inserted.
+ */
+void weightRepository::wrAddPOS(long unsigned int obj, const std::string& pos, long double weight)
+{
+  weight_struct_t *wst = (weight_struct_t *)obj;
+  weight_node_t *wnt = wst->hash->hash_lookup(pos);
+
+  // A null result is treated as "not found" as well, matching
+  // wrGetWeight(); otherwise the else branch would dereference it.
+  if ((long)wnt==HASH_FAIL || wnt == 0)
+  {
+    // Tag not present yet: insert a new node for it.
+    weight_node_t *w = new weight_node_t;
+    w->pos = pos;
+    w->data=weight;
+    wst->hash->hash_insert(w->pos,w);
+  }
+  else
+  { // Tag already present: accumulate the weight.
+    wnt->data = wnt->data + weight;
+  }
+}
+
+/*******************************************************/
+
+/*
+ * void wrAdd(std::string feature, std::string pos, long double weight)
+ * Parámetros:
+ * std::string feature: Atributo a insertar
+ * std::string pos: Etiqueta a insertar
+ * long double weight: Peso a asignar a la etiqueta
+ * Insertamos un nuevo peso para para el atributo feature y la etiqueta pos. 
+ */
+/*
+ * Adds weight to the (feature, pos) pair. When feature has no entry in
+ * the repository (wr) a new weight_struct_t with its own hash is
+ * created, filled through wrAddPOS() and inserted into wr; otherwise
+ * wrAddPOS() is applied to the existing entry.
+ */
+void weightRepository::wrAdd(const std::string& feature, const std::string& pos, long double weight)
+{
+  weight_struct_t *obj = wr.hash_lookup(feature);
+
+  // A null result is treated as "not found" as well, matching
+  // wrGetWeight(); otherwise wrAddPOS() would dereference it.
+  if ( (long) obj == HASH_FAIL || obj == 0)
+  {
+    // Create a new entry in the repository.
+    obj = new weight_struct_t;
+    obj->key = feature;
+    obj->hash = new hash_t;
+    obj->hash->hash_init(10);
+    // Add the tag and its weight, then publish the entry.
+    wrAddPOS((unsigned long)obj,pos,weight);
+    wr.hash_insert(obj->key, obj);
+  }
+  else
+    // Add the tag and its weight to the existing entry.
+    wrAddPOS((unsigned long)obj,pos,weight);
+}
+
+/*******************************************************/
+
+/*
+ * void wrWrite(const std::string& outName, float filter)
+ * Parameters:
+ *      outName: name of the file to create ("w" mode)
+ *      filter:  threshold forwarded to wrGetMergeInput()
+ * Writes the repository to outName, one line per attribute: the key
+ * followed by the string returned by wrGetMergeInput(). Attributes for
+ * which that string is empty are not written. If the file cannot be
+ * opened an error is printed to stderr and the process exits.
+ *
+ * Change 180705: the "float filter" parameter was added.
+ */
+void weightRepository::wrWrite(const std::string& outName, float filter)
+{
+  FILE *f;
+
+  if ((f = fopen(outName.c_str(), "w"))== NULL)
+  {
+    fprintf(stderr, "Error opening file: %s\n",outName.c_str());
+    exit(0);
+  }
+
+  // Walk the repository entry by entry.
+  for (hash_t::iterator it = wr.begin(); it != wr.end(); it++)
+  {
+    weight_struct_t *wst = (weight_struct_t *) ((*it).second);
+
+    // Change 180705: the weights are filtered while building the line.
+    std::string mrg = wrGetMergeInput(wst->hash,filter);
+
+    if (!mrg.empty()) fprintf(f,"%s%s\n",wst->key.c_str(),mrg.c_str());
+  }
+  fclose (f);
+}
+
+/*******************************************************/
+/*
+ * std::string wrGetMergeInput(hash_t *tptr)
+ * Devuelve una cadena de caracteres con todas las parejas
+ * POS/PESO contenidas en el hash (tptr) de un atributo. 
+ *
+ * Modificación 180705:
+ * Añadimos parámetro "float filter" para filtrado de pesos
+ */
+/*
+ * Parameters:
+ *      tptr:   hash of one attribute, holding its weight_node_t entries
+ *      filter: threshold; an entry is emitted only when
+ *              fabsf(weight) > fabsf(filter)
+ * Returns the concatenation of " pos:weight" for every entry of tptr
+ * whose weight, converted to float, is non-zero and passes the filter.
+ * The result is empty when no entry qualifies.
+ */
+std::string weightRepository::wrGetMergeInput(hash_t *tptr, float filter)
+{
+  std::ostringstream out;
+
+  // Iterate the attribute's own hash (tptr). Walking the repository
+  // hash (wr) here would reinterpret its weight_struct_t entries as
+  // weight_node_t.
+  // NOTE(review): hash_t is written without template arguments in this
+  // text; confirm the iterator type matches tptr's element type.
+  for (hash_t::iterator it = tptr->begin(); it != tptr->end(); ++it)
+  {
+    weight_node_t *wnt = (weight_node_t *) ((*it).second);
+
+    if ((float)wnt->data!=0)
+    {
+      // Change 180705: only weights that pass the filter are written.
+      // NOTE(review): the previous implementation formatted the weight
+      // with "%.18E"; the stream default is used here.
+      if ( fabsf(wnt->data) > fabsf(filter) )
+        out << " " << wnt->pos << ":" << (float) wnt->data;
+    }
+  }
+  return out.str();
+}
+
+/*******************************************************/
+
+/*
+ * void wrWriteHash(hash_t *tptr,FILE *f, char separador)
+ * Parameters:
+ *      tptr:      hash of one attribute, holding its weight_node_t entries
+ *      f:         stream to write to
+ *      separador: character written in front of each entry
+ * Writes every POS/weight pair of tptr to f as separador, pos, ':' and
+ * the weight ("%2.10f"). The one exception is the first entry when
+ * separador is '\n': it is written as "pos weight" with no leading
+ * separator.
+ */
+void weightRepository::wrWriteHash(hash_t *tptr,FILE *f, char separador)
+
+{
+  int cont=0;
+
+  // Iterate the hash passed by the caller (tptr), not the repository (wr).
+  for (hash_t::iterator it = tptr->begin(); it != tptr->end(); ++it)
+  {
+    weight_node_t *wnt = (weight_node_t *) ((*it).second);
+
+    if (separador == '\n' && cont==0) fprintf(f,"%s %2.10f",wnt->pos.c_str(),(float)wnt->data);
+    else fprintf(f,"%c%s:%2.10f",separador,wnt->pos.c_str(),(float)wnt->data);
+    cont++;
+  }
+}
+
-- 
2.11.4.GIT