lib/learner.cc

   1 /*
   2  * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  */
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22 #include <unistd.h>
  23 #include "marks.h"
  24 #include "hash.h"
  25 #include "list.h"
  26 #include "dict.h"
  27 #include "stack.h"
  28 #include "swindow.h"
  29 #include "mapping.h"
  30 #include "weight.h"
  31 #include "learner.h"
  32 #include "common.h"
  33
  34 /**************************************************************/
  35
  36 extern int verbose;
  37 double time_svmlight = 0;
  38
  39 /**************************************************************/
  40
  41 stack_t DO;
  42 int KERNEL=0;
  43 int DEGREE=0;
  44 float CK = 0;
  45 float CU = 0;
  46 float X  = 3;
  47 int MAX_MAPPING_SIZE = 100000;
  48 int COUNT_CUT_OFF = 2;
  49 int WINDOW_SIZE = 5;
  50 int CORE_POSITION = 2;
  51 char *TRAINSET = NULL;
  52 char *SVMDIR = NULL;
  53 char *NAME = NULL;
  54 char *BLEX = NULL;
  55 char *R = NULL;
  56 float DRATIO = 0.001;
  57 float ERATIO=0;
  58 float KFILTER = 0;
  59 float UFILTER = 0;
  60 int   REMOVE_FILES = TRUE;
  61
  62 /**************************************************************/
  63
  64 char *UP = NULL;
  65 char *AP = NULL;
  66
  67 /**************************************************************/
  68
  69 //ambiguous-right [default]
  70 const char *A0 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n";
  71 const char *A0UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(2)\na(3)\na(4)\nz(2)\nz(3)\nz(4)\nca(1)\ncz(1)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n";
  72
  73 /**************************************************************/
  74
  75 //unambiguous-right
  76 const char *A1 =  "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n";
  77 const char *A1UNK =  "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(1)\np(2)\np(-2,-1)\np(-1,0)\np(-1,1)\np(0,1)\np(1,2)\np(-2,-1,0)\np(-2,-1,1)\np(-1,0,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n";
  78
  79 /**************************************************************/
  80
  81 //no-right
  82 const char *A2 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\n";
  83 const char *A2UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\nk(0)\nm(0)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n";
  84
  85 /**************************************************************/
  86
  87 //unsupervised-learning
  88 const char *A3 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\n";
  89 const char *A3UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2))\np(-2))\np(-1))\np(-2,-1))\np(-1,1))\np(1,2))\np(-2,-1,1))\np(-1,1,2))\nk(-2))\nk(-1))\nk(1))\nk(2)\n)\nm(-2))\nm(-1))\nm(1))\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n";
  90
  91 /**************************************************************/
  92
  93 //ambiguous-right ++ unknown words on training
  94 const char *A4 = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\n";
  95 const char *A4UNK = "w(-2)\nw(-1)\nw(0)\nw(1)\nw(2)\nw(-2,-1)\nw(-1,0)\nw(0,1)\nw(-1,1)\nw(1,2)\nw(-2,-1,0)\nw(-2,-1,1)\nw(-1,0,1)\nw(-1,1,2)\nw(0,1,2)\np(-2)\np(-1)\np(-2,-1)\np(-1,1)\np(1,2)\np(-2,-1,1)\np(-1,1,2)\nk(0)\nk(1)\nk(2)\nm(0)\nm(1)\nm(2)\na(1)\na(2)\na(3)\na(4)\nz(1)\nz(2)\nz(3)\nz(4)\nL\nSA\nAA\nSN\nCA\nCAA\nCP\nCC\nCN\nMW\n";
  96
  97 /**************************************************************/
  98
  99 char *learner::read_feature_list_from_config_file(FILE *f, char *first_feature)
 100 {
 101   char tmp[1000],str[500],*out;
 102   strcpy(tmp,"");
 103   sprintf(tmp,"%s\n",first_feature);
 104
 105   int cont=1;
 106   int ret=1;
 107
 108   while (ret>0)
 109     {
 110       strcpy(str,"");
 111       ret = readTo(f,' ','\n',str);
 112       if (ret>=0)
 113         {
 114           sprintf(tmp,"%s%s\n",tmp,str);
 115           cont++;
 116         }
 117     }
 118   out = new char[strlen(tmp)+1];
 119   strcpy(out,tmp);
 120   return out;
 121 }
 122
 123 /**************************************************************/
 124
 125 void learner::read_config_file(const char *config_file)
 126 {
 127   int ret1=1,ret2=1,ret3=1;
 128   char str1[500],str2[500],str3[500];
 129
 130   FILE *f = openFile(config_file,"r");
 131   FILE *tmp = openFile("tmp_config.svmt","w+");
 132   char c,ant;
 133   ant = '~';
 134   while (!feof(f))
 135     {
 136       c = fgetc(f);
 137       if (c=='#') readTo(f,'\n','\n',str1);
 138       else if (!(c == ' ' && ant == ' ')) fprintf(tmp,"%c",c);
 139       ant = c;
 140     }
 141   fclose(f);
 142
 143   fseek(tmp,0,SEEK_SET);
 144   while (!feof(tmp))
 145     {
 146       strcpy(str1,""); strcpy(str2,""); strcpy(str3,"");
 147       ret1 = readTo(tmp,' ','=',str1);
 148       if (ret1>=0 && strcmp(str1,"do")==0)
 149         {
 150           ret2 = readTo(tmp,' ',' ',str2);
 151           ret3 = readTo(tmp,' ','\n',str3);
 152           char *p = strstr(str2,"M");
 153           int *modelo = new int;
 154           int *direction = new int;
 155           *modelo = atoi(&p[1]);
 156           if (strcmp(str3,"LR")==0) *direction = LEFT_TO_RIGHT;
 157           else if (strcmp(str3,"RL")==0) *direction = RIGHT_TO_LEFT;
 158           else if (strcmp(str3,"LRL")==0) *direction = LR_AND_RL;
 159           push(&DO,direction);
 160           push(&DO,modelo);
 161         }
 162       else if (ret1>=0 && strcmp(str1,"")!=0)
 163         {
 164           ret2 = readTo(tmp,' ',' ',str2);
 165           ret2 = readTo(tmp,' ','\n',str2);
 166           if (ret1>=0 && ret2>=0)
 167             {
 168               if (strcmp(str1,"A0k")==0 || strcmp(str1,"A0")==0)        A0 = read_feature_list_from_config_file(tmp,str2);
 169               else if (strcmp(str1,"A1k")==0 || strcmp(str1,"A1")==0) A1 = read_feature_list_from_config_file(tmp,str2);
 170               else if (strcmp(str1,"A2k")==0 || strcmp(str1,"A2")==0) A2 = read_feature_list_from_config_file(tmp,str2);
 171               else if (strcmp(str1,"A3k")==0 || strcmp(str1,"A3")==0) A3 = read_feature_list_from_config_file(tmp,str2);
 172               else if (strcmp(str1,"A4k")==0 || strcmp(str1,"A4")==0) A4 = read_feature_list_from_config_file(tmp,str2);
 173               else if (strcmp(str1,"A0u")==0 || strcmp(str1,"A0unk")==0) A0UNK = read_feature_list_from_config_file(tmp,str2);
 174               else if (strcmp(str1,"A1u")==0 || strcmp(str1,"A1unk")==0) A1UNK = read_feature_list_from_config_file(tmp,str2);
 175               else if (strcmp(str1,"A2u")==0 || strcmp(str1,"A2unk")==0) A2UNK = read_feature_list_from_config_file(tmp,str2);
 176               else if (strcmp(str1,"A3u")==0 || strcmp(str1,"A3unk")==0) A3UNK = read_feature_list_from_config_file(tmp,str2);
 177               else if (strcmp(str1,"A4u")==0 || strcmp(str1,"A4unk")==0) A4UNK = read_feature_list_from_config_file(tmp,str2);
 178               else if (strcmp(str1,"F")==0 && ret3>=0)
 179                 {
 180                   ret3 = readTo(tmp,'\n','\n',str3);
 181                   MAX_MAPPING_SIZE = atoi(str3);
 182                   COUNT_CUT_OFF = atoi (str2);
 183                 }
 184               else if (strcmp(str1,"W")==0)
 185                 {
 186                   ret3 = readTo(tmp,'\n','\n',str3);
 187                   WINDOW_SIZE = atoi(str2);
 188                   CORE_POSITION = atoi (str3);
 189                 }
 190               else if (strcmp(str1,"TRAINSET")==0)
 191                 {
 192                   TRAINSET = new char[strlen(str2)+1];
 193                   strcpy(TRAINSET,str2);
 194                 }
 195               else if (strcmp(str1,"BLEX")==0)
 196                 {
 197                   BLEX = new char[strlen(str2)+1];
 198                   strcpy(BLEX,str2);
 199                 }
 200               else if (strcmp(str1,"R")==0)
 201                 {
 202                   R = new char[strlen(str2)+1];
 203                   strcpy(R,str2);
 204                 }
 205               else if (strcmp(str1,"SVMDIR")==0)
 206                 {
 207                   SVMDIR = new char[strlen(str2)+1];
 208                   strcpy(SVMDIR,str2);
 209                 }
 210               else if (strcmp(str1,"NAME")==0)
 211                 {
 212                   NAME = new char[strlen(str2)+1];
 213                   strcpy(NAME,str2);
 214                 }
 215               else if (strcmp(str1,"REMOVE_FILES")==0)
 216                 {
 217                   REMOVE_FILES = atoi (str2);
 218                 }
 219               else if (strcmp(str1,"CK")==0)
 220                 {
 221                   CK = atof (str2);
 222                 }
 223               else if (strcmp(str1,"CU")==0)
 224                 {
 225                   CU = atof (str2);
 226                 }
 227               else if (strcmp(str1,"Dratio")==0)
 228                 {
 229                   DRATIO = atof (str2);
 230                 }
 231               else if (strcmp(str1,"Eratio")==0)
 232                 {
 233                   ERATIO = atof (str2);
 234                 }
 235               else if (strcmp(str1,"Kfilter")==0)
 236                 {
 237                   KFILTER = atof (str2);
 238                 }
 239               else if (strcmp(str1,"Ufilter")==0)
 240                 {
 241                   UFILTER = atof (str2);
 242                 }
 243               else if (strcmp(str1,"X")==0)
 244                 {
 245                   X  = atof (str2);
 246                 }
 247               else if (strcmp(str1,"AP")==0)
 248                 {
 249                   int ret  = 1;
 250                   learnerAMBP_H = new hash_t;
 251                   hash_init(learnerAMBP_H,30);
 252
 253                   infoDict * etiq = new infoDict;
 254                   strcpy(etiq->txt,str2);
 255                   hash_insert(learnerAMBP_H,etiq->txt,(uintptr_t) etiq);
 256
 257                   if (ret2>0)
 258                   {
 259                     memset(str2,0,strlen(str2));
 260                     fgets(str2,500,tmp);
 261
 262                     for (int i=0; i<strlen(str2);i++)
 263                       {
 264                         memset(str3,0,strlen(str3));
 265                         sscanf(str2+i,"%s",str3);
 266                         i = strlen(str3) + i;
 267
 268                         etiq = new infoDict;
 269                         strcpy(etiq->txt,str3);
 270                         hash_insert(learnerAMBP_H,etiq->txt,(uintptr_t) etiq);
 271                       } //EndFor
 272                   } //End if(ret2>0)
 273                 }
 274               else if (strcmp(str1,"UP")==0)
 275                 {
 276                   int ret  = 1;
 277                   learnerUNKP_H = new hash_t;
 278                   hash_init(learnerUNKP_H,30);
 279
 280                   infoDict * etiq = new infoDict;
 281                   strcpy(etiq->txt,str2);
 282                   hash_insert(learnerUNKP_H,etiq->txt,(uintptr_t) etiq);
 283                   if (ret2>0)
 284                     {
 285                       memset(str2,0,strlen(str2));
 286                       fgets(str2,500,tmp);
 287
 288                       for (int i=0; ret2>0 && i<strlen(str2);i++)
 289                         {
 290                           memset(str3,0,strlen(str3));
 291                           sscanf(str2+i,"%s",str3);
 292                           i = strlen(str3) + i;
 293
 294                           etiq = new infoDict;
 295                           strcpy(etiq->txt,str3);
 296                           hash_insert(learnerUNKP_H,etiq->txt,(uintptr_t) etiq);
 297                         } //Endfor
 298                     } //End if(ret2>0)
 299                 }
 300             }
 301         }
 302     }
 303   fclose (tmp);
 304
 305   if (verbose==TRUE)
 306     {
 307         fprintf(stderr,"\n* ===================== SVMTlearn configuration ==========================");
 308         fprintf(stderr,"\n* config file   = [ %s ]\n* trainset      = [ %s ]\n* model name    = [ %s ]",config_file,TRAINSET,NAME);
 309         fprintf(stderr,"\n* SVM-light dir = [ %s ]",SVMDIR);
 310         fprintf(stderr,"\n* ========================================================================");
 311         fprintf(stderr,"\n* unknown words expected   = [ X  = %f  % ]",X);
 312         fprintf(stderr,"\n* C parameter for known    = [ CK = %f ]",CK);
 313         fprintf(stderr,"\n* C parameter for unknown  = [ CU = %f ]",CU);
 314         fprintf(stderr,"\n* D ratio                  = [ Dratio = %f ]",DRATIO);
 315         fprintf(stderr,"\n* E ratio                  = [ Eratio = %f ]",ERATIO);
 316         fprintf(stderr,"\n* Known weights filter     = [ Kfilter = %f ]",KFILTER);
 317         fprintf(stderr,"\n* Unknown weights filter   = [ Ufilter = %f ]",UFILTER);
 318         fprintf(stderr,"\n* sliding window settings  = [ WINDOW SIZE = %d , CORE POSITION = %d ]",WINDOW_SIZE,CORE_POSITION);
 319         fprintf(stderr,"\n* mapping settings         = [ COUNT CUT OFF = %d , MAX MAPPING SIZE = %d ]",COUNT_CUT_OFF,MAX_MAPPING_SIZE);
 320         fprintf(stderr,"\n* remove temporal files    = [ %d ] (1) TRUE, (0) FALSE",REMOVE_FILES);
 321         fprintf(stderr,"\n* ========================================================================");
 322     }
 323
 324   system("rm -f tmp_config.svmt");
 325
 326   if (TRAINSET == NULL) { fprintf (stderr,"\nError: TRAINSET parameter not found in %s.\n",config_file); exit(-1); }
 327   if (NAME == NULL) { fprintf (stderr,"\nError: MODEL NAME parameter not found in %s.\n",config_file); exit(-1); }
 328   if (SVMDIR == NULL) {  fprintf (stderr,"\nError: SVM DIRECTORY parameter not found in %s.\n",config_file); exit(-1); }
 329 }
 330
 331 /**************************************************************/
 332
 333 learner::learner()
 334 {
 335   learnerAMBP_H = NULL ;
 336   learnerUNKP_H = NULL ;
 337 }
 338
 339 learner::~learner()
 340 {
 341  char str[1000];
 342  strcpy(str,"");
 343  if (REMOVE_FILES==TRUE)
 344  {
 345    removeFiles(NAME,RM_TEMP_FILES,0,0,verbose);
 346  }
 347
 348  if ( verbose == TRUE ) fprintf(stderr,"\n\nTERMINATION ... ");
 349
 350  if (TRAINSET!=NULL) delete TRAINSET;
 351  if (SVMDIR!=NULL) delete SVMDIR;
 352  if (NAME!=NULL) delete NAME;
 353  if (BLEX!=NULL) delete BLEX;
 354  if (R!=NULL) delete R;
 355
 356  if ( verbose == TRUE ) fprintf(stderr,"[DONE]\n\n");
 357 }
 358
 359 /***************************************************************/
 360
 361 void learner::learnerCreatePOSFile(char *modelName, int is_ambp, hash_t *h)
 362 {
 363   char name[300];
 364   if (is_ambp==TRUE)
 365     {
 366       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.AMBP",modelName);
 367       sprintf(name,"%s.AMBP",modelName);
 368       FILE *f = openFile (name,"w");
 369       hash_print(h,f);
 370       fclose(f);
 371     }
 372   else
 373     {
 374       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.UNKP",modelName);
 375       sprintf(name,"%s.UNKP",modelName);
 376       FILE *f = openFile (name,"w");
 377       hash_print(h,f);
 378       fclose(f);
 379     }
 380 }
 381
 382 /***************************************************************/
 383
 384 void learner::learnerCreateDefaultFile(const char *modelName, const char *str)
 385 {
 386   char name[500];
 387   sprintf (name,"%s.%s",modelName,str);
 388   FILE *f = openFile(name, "w");
 389
 390   if (strcmp(str,"A0")==0)
 391   {
 392       if ( verbose == TRUE )  fprintf(stderr,"\nStoring %s.A0",modelName); fprintf(f,"%s",A0);
 393       sprintf (name,"%s.%s.UNK",modelName,str);
 394       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A0.UNK",modelName);
 395       FILE *funk = openFile(name, "w");
 396       fprintf(funk,"%s",A0UNK);
 397       fclose(funk);
 398   }
 399   else if (strcmp(str,"A1")==0)
 400   {
 401       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A1",modelName); fprintf(f,"%s",A1);
 402       sprintf (name,"%s.%s.UNK",modelName,str);
 403       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A1.UNK",modelName);
 404       FILE *funk = openFile(name, "w");
 405       fprintf(funk,"%s",A1UNK);
 406       fclose(funk);
 407   }
 408   else if (strcmp(str,"A2")==0)
 409   {
 410       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A2",modelName); fprintf(f,"%s",A2);
 411       sprintf (name,"%s.%s.UNK",modelName,str);
 412       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A2.UNK",modelName);
 413       FILE *funk = openFile(name, "w");
 414       fprintf(funk,"%s",A2UNK);
 415       fclose(funk);
 416   }
 417   else if (strcmp(str,"A3")==0)
 418   {
 419       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A3",modelName); fprintf(f,"%s",A3);
 420       sprintf (name,"%s.%s.UNK",modelName,str);
 421       if ( verbose == TRUE )  fprintf(stderr,"\nStoring %s.A3.UNK",modelName);
 422       FILE *funk = openFile(name, "w");
 423       fprintf(funk,"%s",A3UNK);
 424       fclose(funk);
 425   }
 426   else if (strcmp(str,"A4")==0)
 427   {
 428       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A4",modelName); fprintf(f,"%s",A4);
 429       sprintf (name,"%s.%s.UNK",modelName,str);
 430       if ( verbose == TRUE ) fprintf(stderr,"\nStoring %s.A4.UNK",modelName);
 431       FILE *funk = openFile(name, "w");
 432       fprintf(funk,"%s",A4UNK);
 433       fclose(funk);
 434   }
 435   else if (strcmp(str,"WIN")==0)
 436     {
 437       if ( verbose == TRUE )  fprintf(stderr,"\nStoring %s.WIN",modelName);
 438       fprintf(f,"%d\n%d\n",WINDOW_SIZE,CORE_POSITION);
 439     }
 440   fclose(f);
 441 }
 442
 443 /***************************************************************/
 444
 445 /*
 446  * Return CHAR_NULL if end of file
 447  * Return w,p,k (if we readed a) or s (if we readed m) if it's found
 448  */
 449 char learner::obtainAtrChar(FILE *channel)
 450 {
 451   char c;
 452   while (!feof(channel))
 453     {
 454       c=fgetc(channel);
 455       if (c=='w' || c=='p' || c=='k' || c=='m')
 456         {
 457           fgetc(channel);
 458           switch (c)
 459             {
 460             case 'k': return 'k';
 461             case 'm': return 's';
 462             default:  return c;
 463             }
 464         }
 465     }
 466   return CHAR_NULL;
 467 }
 468
 469 /**************************************************************/
 470
 471 /*
 472  * Cada atributo de una lista de atributos tiene la forma
 473  * <Marca>(<cadena de enteros separados por comas>)
 474  * Usaremos esta función para leer cada uno de los enteros de
 475  * la cadena que indica las posiciones de la ventana a considerar.
 476  * Lee del fichero (channel) un entero entre '(' y coma, comas,
 477  * coma y ')’ o paréntesis. Devuelve el número leído como entero.
 478  */
 479 int learner::obtainAtrInt(FILE *channel,int *endAtr)
 480 {
 481   int i=0;
 482   char c=' ',num[5]="";
 483
 484   while ( (!feof(channel)) && (c!='(') && (c!=',') && (c!=')') )
 485     {
 486       c=fgetc(channel);
 487       if ((c!='(') && (c!=')')) num[i]=c;
 488       i++;
 489     }
 490   if (c==')') *endAtr=1;
 491   num[i]='\0';
 492   return atoi(num);
 493 }
 494
 495 /***************************************************************/
 496
 497 /* int learnerCount(char *name, int *nWords, int *nSentences)
 498  * Cuenta el numero de palabras y frases que tiene el corpus
 499  * contenido en el fichero con nombre <name>. El numero de
 500  * palabras y de frases es devuelto como parametro de salida
 501  * mediante <nWords> y <nSentences>.
 502  */
 503 void learner::learnerCount(char* name, int *nWords, int *nSentences)
 504 {
 505   int ret1=0,ret2=0,cont=0,contWords=0;
 506   char c=' ',str[300];
 507
 508   FILE *f = openFile(name, "r");
 509   while (!feof(f) && ret1>=0 && ret2>=0)
 510     {
 511       ret1 = readTo(f,' ',0,str);
 512       if (strcmp(str,"!")==0 || strcmp(str,"?")==0 || strcmp(str,".")==0) cont++;
 513       ret2 = readTo(f,'\n',0,str);
 514       if ( !feof(f) && ret1 >= 0 ) contWords++;
 515     }
 516   fclose(f);
 517
 518   if ( contWords <= 1 )
 519     {
 520       fprintf(stderr,"\n\nInput corpus too short to begin training!! Program stopped.\n\n");
 521       exit(0);
 522     }
 523
 524   *nWords = contWords;
 525   *nSentences = cont;
 526 }
 527
 528
 529 /********************************************************/
 530
 531 void learner::learnerPrintMessage(int numModel, int K_or_U, int LR_or_RL, int is_fex)
 532 {
 533   if (verbose==TRUE)
 534     {
 535       fprintf(stderr,"\n\n* ========================================================================");
 536       if (is_fex==FALSE) fprintf(stderr,"\n* TRAINING MODEL %d ",numModel);
 537       else fprintf(stderr,"\n* FEATURES EXTRACTION FOR MODEL %d ",numModel);
 538       if (is_fex==TRUE) fprintf(stderr,"[ KNOWN AND UNKNOWN WORDS  -  ");
 539       else if (K_or_U==KNOWN)   fprintf(stderr,"[ KNOWN WORDS  -  ");
 540       else fprintf(stderr,"[ UNKNOWN WORDS  -  ");
 541       if (LR_or_RL==LEFT_TO_RIGHT) fprintf(stderr,"LEFT TO RIGHT ]");
 542       else if (LR_or_RL==RIGHT_TO_LEFT) fprintf(stderr,"RIGHT TO LEFT ]");
 543       else if (LR_or_RL==LR_AND_RL)fprintf(stderr,"LEFT TO RIGHT AND RIGHT TO LEFT ]");
 544       fprintf(stderr,"\n* ========================================================================");
 545     }
 546 }
 547
 548 /**************************************************************/
 549
 550 /*
 551  * Parámetros:
 552  * wrd          char *  Palabra de ejemplo que se está tratando.
 553  * numModel     int     Modelo que estamos entrenando.
 554  * direction    int     Dirección en la cual se realiza el entrenamiento
 555                         (Derecha a izquierda o izquierda a derecha).
 556  * Known_or_Unknown     int     Si se está entrenando para palabras conocidas
 557                                 o desconocidas.
 558  * pos  char *  La etiqueta morfosintáctica que estamos entrenando.
 559  * samplePos    char *  Etiqueta morfosintáctica del ejemplo  que estamos tratando.
 560  * features     char *  Lista de atributos generados para el ejemplo.
 561  * d    dictionary *    Diccionario que se está usando para el entrenamiento.
 562  * nNeg int *   Apuntador al número de palabras seleccionadas como ejemplos negativos.
 563  * nPos int *   Apuntador al número de palabras seleccionadas como ejemplos positivos.
 564  *
 565  * Este método puede usarse para seleccionar ejemplos para palabras conocidas
 566  */
 567 void learner::learnerPushSample(char *wrd,int numModel,int direction, int Known_or_Unknown,char *pos, char *samplePos, char *features,dictionary *d, int *nNeg, int *nPos)
 568 {
 569     char fileName[100];
 570     //Se abre el fichero donde se debe insertar el ejemplo.
 571     generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName);
 572
 573     FILE *f = openFile(fileName,"a+");
 574
 575     //Se obtiene la lista de posibles etiquetas morfosintácticas para
 576     //la palabra wrd.
 577     simpleList *l = learnerGetPotser(wrd,Known_or_Unknown,d);
 578     l->setFirst();
 579     for (int stop=1; stop>=0  ; stop = l->next())
 580     {
 581         //Se busca la etiqueta pos en la lista obtenida.
 582         //Si se encuentra y es igual a samplePos se selecciona el ejemplo
 583         //como positivo, si no es igual a samplePos se selecciona como
 584         //negativo.
 585         infoDict *pInfo = (infoDict *) l->getIndex();
 586         if (pInfo != NULL )
 587         {
 588           if (strcmp(pInfo->txt,pos)==0)
 589           {
 590             if (strcmp(pInfo->txt,samplePos)==0)
 591             {
 592                 //Positive Sample
 593                 *nPos=(*nPos)+1;
 594                 fprintf (f,"+1 %s\n",features);
 595             }
 596             else
 597             {
 598                 //Negative Sample
 599                 *nNeg=(*nNeg)+1;
 600                 fprintf (f,"-1 %s\n",features);
 601             }
 602           }
 603         }
 604      }
 605
 606    d->dictCleanListInfoDict(l,l->numElements());
 607    delete l;
 608    //Se cierra el fichero.
 609    fclose(f);
 610 }
 611
 612 /**************************************************************/
 613 /*
 614  * Parámetros:
 615  * wrd  Char *  Palabra del ejemplo que se está tratando.
 616  * numModel     Int     Modelo que estamos entrenando.
 617  * direction    Int     Dirección en la cual se realiza el entrenamiento
 618                         (Derecha a izquierda o izquierda a derecha).
 619  * Known_or_Unknown     Int     Si se está entrenando para palabras conocidas
 620                                 o desconocidas.
 621  * pos  Char *  La etiqueta morfosintáctica que estamos entrenando.
 622  * samplePos    Char *  Etiqueta morfosintáctica del ejemplo  que estamos tratando.
 623  * features     Char *  Lista de atributos generadas para el ejemplo.
 624  * d    dictionary *    Diccionario que se está usando para el entrenamiento.
 625  * nNeg int *   Apuntador al número de palabras seleccionadas como ejemplos negativos.
 626  * nPos int *   Apuntador al número de palabras seleccionadas como ejemplos positivos.
 627  * Este método se encarga de seleccionar ejemplos para palabras desconocidas.
 628 */
 629 void learner::learnerPushSampleUnk(char *wrd,int numModel,int direction, int Known_or_Unknown,char *pos, char *samplePos, char *features,dictionary *d, int *nNeg, int *nPos)
 630 {
 631     char fileName[100];
 632     generateFileName(NAME,pos,numModel,direction, Known_or_Unknown, "POS", fileName);
 633     //Abrimos el fichero
 634     FILE *f = openFile(fileName,"a+");
 635
 636     //Si pos es igual a samplePos se selecciona el ejemplo como positivo,
 637     //en cualquier otro caso se selecciona como negativo.
 638     if (strcmp(samplePos,pos)==0)
 639            {
 640                 //Positive Sample
 641                 *nPos=(*nPos)+1;
 642                 fprintf (f,"+1 %s\n",features);
 643            }
 644     else
 645             {
 646                 //Negative Sample
 647                 *nNeg=(*nNeg)+1;
 648                 fprintf (f,"-1 %s\n",features);
 649             }
 650    //Cerramos el fichero
 651    fclose(f);
 652 }
 653
 654 /**************************************************************/
 655 /*
 656  * Este método recibe como parámetros un apuntador a un fichero (f)
 657  * y el apuntador a un objeto del tipo mapping. El objetivo de este
 658  * método es leer los atributos generados para un ejemplo del fichero
 659  * f y devolver un string con la lista de atributos en el formato
 660  * esperado por SVM-light.
 661  */
 662 char *learner::learnerCreateFeatureString(FILE *f,mapping *m)
 663 {
 664         char *features = new char[1000];
 665         int array[learnerNumFeatures];
 666         int ret1=1,i = 0;
 667         char str[100];
 668
 669         //Construimos un array de enteros con los
 670         //identificadores numéricos de cada atributo
 671         while (ret1>0 && !feof(f))
 672         {
 673           ret1 = readTo(f,' ','\n',str);
 674           int num = m->mappingGetNumberByFeature(str);
 675           if (ret1>=0 && num>-1)
 676             {
 677               array[i]=num;
 678               i++;
 679             }
 680         }
 681
 682         //qsort --> ordena ascendetemente un array de enteros
 683         qsort(array,0,i-1);
 684         sprintf(features,"");
 685
 686         for (int j=0;j<i;j++)
 687         {
 688           //Contruimos la cadena de caracteres con losidentificadores
 689           //numéricos
 690           if (j!=i-1) sprintf(features,"%s%d:1 ",features,array[j]);
 691           else sprintf(features,"%s%d:1",features,array[j]);
 692         }
 693         return features;
 694 }
 695
 696 /**************************************************************/
 697
 698 /*
 699  * Parámetros:
 700  * d    dictionary*     Diccionario que se está usando para el entrenamiento.
 701  * m    Mapping*        Mapping entre atributos e identificadores numéricos
 702  * f    FILE*   Apuntador al fichero de ejemplos
 703  * pos  char *  La etiqueta morfosintáctica que estamos entrenando.
 704  * numModel     int     Modelo que estamos entrenando.
 705  * direction    int     Dirección en la cual se realiza el entrenamiento (Derecha a
 706                         izquierda o izquierda a derecha).
 707  * K_or_U       int     Si se está entrenando para palabras conocidas o desconocidas.
 708  * nNeg int *   Apuntador al número de palabras seleccionadas como ejemplos negativos.
 709  * nPos int *   Apuntador al número de palabras seleccionadas como ejemplos positivos.
 710  *
 711  * Este método lee el fichero en el cual están contenidos los ejemplos seleccionados
 712  * para el entrenamiento.
 713 */
 714 void learner::learnerDressNakedSetTrain(dictionary *d,mapping *m,FILE *f, char* pos,int numModel, int direction, int K_or_U,int *nPos,int *nNeg)
 715 {
 716     char wrd[TAM_WORD], samplePos[TAM_POS];
 717     char *features/*[1000]*/, garbage[1000],str[100];
 718     int array[learnerNumFeatures],cont=0;
 719
 720     *nNeg=0; *nPos=0;
 721
 722     fseek(f,0,SEEK_SET);
 723     //Para cada ejemplo seleccionado
 724     while (!feof(f))
 725     {
 726       int ret1 = readTo(f,':',0,wrd);
 727       int ret2 = readTo(f,' ',0,samplePos);
 728       if (ret1>=0 && ret2>=0)
 729         {
 730             int isPossiblePOS = learnerIsPossiblePOS(wrd,pos,K_or_U);
 731             if (isPossiblePOS==TRUE)
 732             {
 733               //Preparamos la lista de features
 734               features = learnerCreateFeatureString(f,m);
 735
 736               if (K_or_U==KNOWN) learnerPushSample(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos);
 737               else learnerPushSampleUnk(wrd,numModel,direction,K_or_U,pos,samplePos,features,d,nNeg,nPos);
 738
 739               if ( verbose  == TRUE) showProcessDone(cont , 1000, FALSE,"samples");
 740
 741               delete features;
 742             }
 743           else readTo(f,'\n','\n',garbage);
 744           cont++;
 745         } //if
 746     } //While
 747 }
 748
 749 /**************************************************************/
 750
 751 /*
 752  * Parámetros:
 753  * f    FILE*   Apuntador al fichero de ejemplos
 754  * numModel     Int     Modelo que estamos entrenando.
 755  * LR_or_RL     Int     Dirección en la cual se realiza el entrenamiento
 756  *                      (Derecha a izquierda o izquierda a derecha).
 757  * K_or_U       Int     Si se está entrenando para palabras conocidas o desconocidas.
 758  * d    dictionary*     Diccionario que se está usando para el entrenamiento.
 759  * lPosToTrain  simpleList *    Lista de etiquetas morfosintácticas a entrenar
 760  *
 761  * Este método es el encargado de realizar el aprendizaje. Prepara las opciones de
 762  * ejecución  para SVM-light.  Construye el mapping mediante los datos contenidos
 763  * en el fichero de entrada. Crea un depósito de pesos (weightRepository) y
 764  * un hashing en el que almacenar los sesgos obtenidos para cada etiqueta.
 765  * Y prepara los datos para que puedan ser procesados por SVM-light.
 766  * Para cada etiqueta morfosintáctica de la lista lPosToTrain, se
 767  * llama al método learnerDressNakedSetTrain para conseguir los ficheros
 768  * de entrada para SVM-light. Una vez hecho esto, se llama a learnerExecSVMlight,
 769  * para ejecutar SVM-light. Con el fichero de salida generado por la herramienta
 770  * de Joachims, se rellenan las estructuras de datos con los pesos
 771  * (learnerBuiltWeightRepository) y los sesgos (learnerBuiltBias)
 772  * Una vez procesadas todas las etiquetas morfosintácticas a entrenar, el depósito
 773  * de pesos y el hashing de sesgos contienen todos los datos del modelo y se escriben
 774  * en disco mediante los métodos weightRepository.wrWriteHash para los sesgos y
 775  * weightRepository.wrWrite para los pesos.
 776  */
 777
 778 void learner::learnerDoLearn(FILE *f,int numModel,int LR_or_RL,int K_or_U,dictionary *d,simpleList *lPosToTrain)
 779 {
 780   char posFileName[100],svmFileName[100],mapFileName[100];
 781
 782   //Preparamos las opciones con que se ejecutará svm-light
 783   char options[100]="";
 784   if (CK!=0 && K_or_U==KNOWN) sprintf(options," -c %.6f ",CK);
 785   else if (CU!=0 && K_or_U==UNKNOWN) sprintf(options," -c %.6f ",CU);
 786   if (KERNEL!=0) sprintf(options,"%s -t %d ",options,KERNEL);
 787   if (DEGREE!=0) sprintf(options,"%s -d %d ",options,DEGREE);
 788
 789   learnerPrintMessage(numModel,K_or_U,LR_or_RL,FALSE);
 790
 791   if ( verbose == TRUE )
 792     {
 793       fprintf(stderr,"\nBuilding MAPPING for MODEL %d [ ",numModel);
 794       if (K_or_U==KNOWN) fprintf(stderr," KNOWN WORDS - ");
 795       else fprintf(stderr," UNKNOWN WORDS - ");
 796       if (LR_or_RL==LEFT_TO_RIGHT) fprintf(stderr,"LEFT TO RIGHT ]\n");
 797       else  fprintf(stderr,"RIGHT_TO_LEFT ]");
 798     }
 799
 800   //Construimos el mapping a partir de los ejemplos seleccionados
 801   mapping *m =  new mapping();
 802   m->mappingBuilt(f,MAX_MAPPING_SIZE,COUNT_CUT_OFF);
 803   generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MAP",mapFileName);
 804   m->mappingWrite(mapFileName,FALSE);
 805
 806   //Creamos el depósito de pesos y el hash de sesgos
 807   hash_t *b = new hash_t;
 808   hash_init(b,30);
 809   weightRepository *wr = new weightRepository;
 810
 811   infoDict *pInfo;
 812   int nPositive=0,nNegative=0;
 813
 814   //Nos situamos en el primer  elemento de lista
 815   lPosToTrain->setFirst();
 816   //Para cada elemento de la lista de etiquetas
 817   for (int ret=1; ret>=0; ret=lPosToTrain->next())
 818   {
 819       if ( verbose == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------");
 820
 821       //Obtenemos la etiqueta morfosintáctica
 822       pInfo = (infoDict *)lPosToTrain->getIndex();
 823       nPositive=0;
 824       nNegative=0;
 825
 826       //Preparamos el entrenamiento
 827       generateFileName(NAME,pInfo->txt,numModel,LR_or_RL,K_or_U,"POS",posFileName);
 828       generateFileName(NAME,pInfo->txt,numModel,LR_or_RL,K_or_U,"SVM",svmFileName);
 829
 830       //Seleccionamos los ejemplos para el entrenamiento para la POS que estamos viendo
 831       if ( verbose == TRUE ) fprintf(stderr,"\nPreparing training set for [ %s ] ..",pInfo->txt);
 832               learnerDressNakedSetTrain(d,m,f,pInfo->txt,numModel,LR_or_RL, K_or_U,&nPositive,&nNegative);
 833
 834       //Realizamos el entrenamiento llamando a SVM-light
 835       if ( verbose == TRUE ) fprintf(stderr,"\nTraining [ %s ] with %d samples: [+] = %d samples ; [-] = %d samples\n",pInfo->txt,nPositive+nNegative,nPositive,nNegative);
 836        learnerExecSVMlight(SVMDIR,options,posFileName,svmFileName);
 837
 838       //Se insertan los valores obtenidos del entrenamiento en el depósito de pesos
 839       //y el el hashing de sesgos
 840       if ( verbose == TRUE ) fprintf(stderr,"\nAdding elements to MERGED MODEL from [ %s ]",posFileName);
 841       wr = learnerBuiltWeightRepository(wr,m,pInfo->txt,svmFileName);
 842       if ( verbose == TRUE ) fprintf(stderr," [DONE]");
 843       if ( verbose == TRUE ) fprintf(stderr,"\nAdding biases from [ %s ]",posFileName);
 844       b =  learnerBuiltBias(b,pInfo->txt,svmFileName);
 845       if ( verbose == TRUE ) fprintf(stderr," [DONE]");
 846
 847       if (REMOVE_FILES == TRUE)
 848         {
 849           char cmd[150];
 850           sprintf(cmd,"rm -f %s",posFileName);
 851           system(cmd);
 852           sprintf(cmd,"rm -f %s",svmFileName);
 853           system(cmd);
 854         }
 855   }
 856   lPosToTrain->setFirst();
 857
 858   if ( verbose == TRUE ) fprintf(stderr,"\n-----------------------------------------------------------");
 859   char fileName[100];
 860   generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"MRG",fileName);
 861   if ( verbose == TRUE ) fprintf(stderr,"\nStoring MERGED MODEL [ %s ]",fileName);
 862   //Modificación 180705: Filtrado de pesos
 863   if ( K_or_U == KNOWN ) wr->wrWrite(fileName, KFILTER); //ADD 180705
 864   else wr->wrWrite(fileName, UFILTER); //ADD 180705
 865   //Escribir deposito de pesos en disco
 866   //wr->wrWrite(fileName); //DEL 180705
 867   if ( verbose == TRUE ) fprintf(stderr," [DONE]");
 868
 869   FILE *fwr = openFile(fileName,"a+");
 870   fprintf (fwr,"BIASES ");
 871   wr->wrWriteHash(b,fwr,' ');  //Escribir biases en fichero de depósito de pesos
 872   fclose(fwr);
 873
 874   generateFileName(NAME,"",numModel,LR_or_RL,K_or_U,"B",fileName);
 875   if ( verbose == TRUE ) fprintf(stderr,"\nStoring BIASES [ %s ]",fileName);
 876   FILE *fb =openFile(fileName,"w");
 877   wr->wrWriteHash(b,fb,'\n');  //Escribir biases en fichero de sesgos
 878   fclose(fb);
 879
 880   if ( verbose == TRUE ) fprintf(stderr," [DONE]");
 881
 882   delete m;
 883   delete wr;
 884   learnerDestroyBias(b);
 885 }
 886
 887 /*******************************************************/
 888 /*
 889  * Recibe como parámetro el nombre del fichero de entrenamiento
 890  * (trainingFileName), el diccionario para palabras conocidas (dKnown),
 891  * el número del modelo a entrenar (numModel), la dirección en la cual
 892  * se realiza el entrenamiento (direction), el número de frases del
 893  * corpus (numSent), el número de palabras del corpus (numWords) y
 894  * el número de fragmentos en que se ha de dividir el corpus para
 895  * entrenar palabras desconocidas (numChunks).
 896  *
 897  * Este método selecciona ejemplo y realiza el entrenamiento del modelo
 898  */
 899 void learner::learnerTrainModel(char *trainingFileName, dictionary *dKnown,int numModel, int direction, int numSent,int numWords, int numChunks)
 900 {
 901   FILE *fKnownRL,*fUnknownRL,*fUnknownLR,*fKnownLR;
 902   int contSentences=0,ret = 1;
 903   char name[200];
 904
 905   //Carga las listas de atributos
 906   simpleList featureList,featureListUnk;
 907   sprintf(name,"%s.A%d",NAME,numModel);
 908   createFeatureList(name,&featureList);
 909   sprintf(name,"%s.A%d.UNK",NAME,numModel);
 910   createFeatureList(name,&featureListUnk);
 911
 912   if (direction==LEFT_TO_RIGHT) learnerPrintMessage(numModel,-1,LEFT_TO_RIGHT,TRUE);
 913   else if (direction==RIGHT_TO_LEFT) learnerPrintMessage(numModel,-1,RIGHT_TO_LEFT,TRUE);
 914   else if (direction==LR_AND_RL) learnerPrintMessage(numModel,-1,LR_AND_RL,TRUE);
 915
 916   //Abrimos los ficheros de ejemplos
 917   if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL)
 918     {
 919       sprintf(name,"%s.M%d.RL.SAMPLES",NAME,numModel);
 920       fKnownRL=openFile(name,"w+");
 921       sprintf(name,"%s.UNK.M%d.RL.SAMPLES",NAME,numModel);
 922       fUnknownRL=openFile(name,"w+");
 923     }
 924   if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL)
 925     {
 926       sprintf(name,"%s.M%d.LR.SAMPLES",NAME,numModel);
 927       fKnownLR=openFile(name,"w+");
 928       sprintf(name,"%s.UNK.M%d.LR.SAMPLES",NAME,numModel);
 929       fUnknownLR=openFile(name,"w+");
 930     }
 931
 932   int chunkSize =  (numWords/numChunks) + 1;
 933
 934   if (verbose == TRUE)
 935     {
 936       fprintf(stderr,"\n* X = %f :: CHUNKSIZE = %d :: CHUNKS = %d",X,chunkSize,numChunks);
 937       fprintf(stderr,"\n* ========================================================================");
 938     }
 939
 940   int nWordsRL = 0, nWordsLR = 0;
 941   int inicioRL = -1, inicioLR = -1;
 942
 943   //Para cada chunk
 944   for (int i=0; (ret>=0 && i<numChunks);i++)
 945     {
 946       ret=1;
 947
 948       int is_end_of_chunk = FALSE;
 949       int is_end_of_sentence = FALSE;
 950       if ( verbose == TRUE ) fprintf(stderr,"\nChunk %d [ %d ] ",i+1,i*chunkSize);
 951
 952       //Creamos  el diccionario para entrenar palabras desconocidas
 953       dictionary *dUnknown = new dictionary(trainingFileName,i*chunkSize, (i+1)*chunkSize - 1 );
 954       if (R!=NULL) dUnknown->dictRepairFromFile(R);
 955       else dUnknown->dictRepairHeuristic(DRATIO);
 956       char name[200];
 957       sprintf(name,"%s.DICT.%d",NAME,i);
 958       dUnknown->dictWrite(name);
 959       delete dUnknown;
 960       dUnknown  = new dictionary(name);
 961
 962       //Si el modelo en 4 usaremos el mismo diccionario para conocidas y desconocidas
 963       dictionary *d_for_known = NULL;
 964       //if (numModel==4) dKnown = dUnknown;
 965       if ( numModel == 4 )
 966         {  d_for_known = dUnknown; }
 967       else { d_for_known = dKnown; }
 968
 969       if ( verbose ==  TRUE ) fprintf(stderr,"\nExtracting features : ");
 970
 971       nWordsLR = chunkSize;
 972       nWordsRL = chunkSize;
 973
 974       //Mientras haya palabras por leer y no estemos al final del chunk
 975       while ( ret>=0 && is_end_of_chunk==FALSE )
 976         {
 977           is_end_of_sentence = FALSE;
 978           //Si es LR o LRL
 979           if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL)
 980             {
 981               fKnown = fKnownLR;  fUnknown = fUnknownLR;
 982               //Recorremos el texto en sentido LR para seleccionar ejemplos
 983               nWordsLR = learnerLeftToRight(&featureList,&featureListUnk, d_for_known, dUnknown, nWordsLR, inicioLR);
 984               inicioLR = sw->getIndex()->ord;
 985               if (sw->next() !=  0)
 986                 {
 987                   is_end_of_sentence = TRUE;
 988                   inicioLR = -1;
 989                 }
 990             }
 991           //Si es RL o LRL
 992           if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL)
 993             {
 994               fKnown = fKnownRL; fUnknown = fUnknownRL;
 995               //Recorremos el texto en sentido RL para seleccionar ejemplos
 996               nWordsRL = learnerRightToLeft(&featureList,&featureListUnk, d_for_known , dUnknown, nWordsRL, inicioRL);
 997               inicioRL = sw->getIndex()->ord;
 998               if ( sw->previous() != 0 )
 999                 {
1000                   is_end_of_sentence = TRUE;
1001                   inicioRL = -1;
1002                 }
1003             }
1004
1005           contSentences++;
1006
1007           if ( verbose  == TRUE) showProcess(contSentences,0);
1008           //Si es fin de frase recargamos la ventana
1009           if ( is_end_of_sentence == TRUE )
1010             {
1011               sw->deleteList();
1012               ret = sw->iniGeneric();
1013             }
1014
1015           //Si hemos recorrido todas la palabras del chunk es fin de chunk
1016           if ( nWordsRL <= 0 || nWordsLR <= 0 )
1017             {
1018               is_end_of_chunk = TRUE;
1019             }
1020         }
1021       if ( verbose  == TRUE) showProcess(contSentences,1);
1022       delete dUnknown;
1023
1024       //Si es necesario borramos los ficheros temporales
1025       char cmd[200];
1026       sprintf(cmd, "rm -f %s",name);
1027       if (REMOVE_FILES == TRUE) system(cmd);
1028     }
1029
1030   //Creamos una copia de la lista de etiquetas para palabras conocidas
1031   simpleList *copyOfUNKP_L = learnerTransformHashInList(learnerUNKP_H);
1032   //para cada tipo known o unk
1033   //para cada direccion
1034   if (direction==LEFT_TO_RIGHT || direction==LR_AND_RL)
1035     {
1036       //Realizamos el entrenamiento
1037       learnerDoLearn(fKnownLR,numModel,LEFT_TO_RIGHT,KNOWN,dKnown,learnerAMBP_L);
1038       learnerDoLearn(fUnknownLR,numModel,LEFT_TO_RIGHT,UNKNOWN,dKnown,copyOfUNKP_L);
1039       fclose (fKnownLR);
1040       fclose (fUnknownLR);
1041     }
1042
1043   if (direction==RIGHT_TO_LEFT || direction==LR_AND_RL)
1044     {
1045       //Realizamos el entrenamiento
1046       learnerDoLearn(fKnownRL,numModel,RIGHT_TO_LEFT,KNOWN,dKnown,learnerAMBP_L);
1047       learnerDoLearn(fUnknownRL,numModel,RIGHT_TO_LEFT,UNKNOWN,dKnown,copyOfUNKP_L);
1048       fclose(fKnownRL);
1049       fclose(fUnknownRL);
1050     }
1051   //fin para cada direccion
1052   //fin para cada tipo
1053
1054   destroyFeatureList(&featureList,featureList.numElements());
1055   destroyFeatureList(&featureListUnk,featureListUnk.numElements());
1056 }
1057
1058 /**********************************************************/
1059
1060 /*
1061  * Recibe como parámetros: wrd un string con una palabra,
1062  * pos un string con una etiqueta, Known_or_Unknown si la
1063  * palabra es conocida o desconocida. Si la palabra es conocida
1064  * devuelve TRUE si la etiqueta pos está en la lista de etiquetas
1065  * morfosintácticas ambiguas. Si la palabra es desconocida
1066  * devuelve TRUE si la etiqueta pos pertenece a lista de posibles
1067  * categorías morfosintácticas para palabras desconocidas. En cualquier
1068  * otro caso devuelve FALSE.
1069  */
1070 int learner::learnerIsPossiblePOS(char *wrd, char *pos,int Known_or_Unknown)
1071 {
1072   if (Known_or_Unknown==KNOWN)
1073     {
1074         if (HASH_FAIL != hash_lookup(learnerAMBP_H,pos))
1075             {
1076                 return TRUE;
1077             }
1078     }
1079   else if (Known_or_Unknown==UNKNOWN)
1080     {
1081       if (HASH_FAIL != hash_lookup(learnerUNKP_H,pos))
1082       {
1083          return TRUE;
1084       }
1085     }
1086   return FALSE;
1087 }
1088
1089 /**************************************************************/
1090 /*
1091  * Recibe como parámetros:
1092  *      wrd un string con una palabra,
1093  *      Known_or_Unknown un entero que indica si la palabra
1094  *                       es conocida o desconocida y
1095  *      un apuntador d al diccionario. Devuelve un apuntador a una lista.
1096  */
1097 simpleList *learner::learnerGetPotser(char *wrd, int Known_or_Unknown, dictionary *d)
1098 {
1099   int stop=FALSE,ret=1;
1100   int w = d->getElement(wrd);
1101   infoDict *pInfoDict;
1102   simpleList *lout = new simpleList;
1103
1104   //Si es conocida recoge las posibles etiquetas que contiene el diccionario
1105   //para la palabra wrd  y mira si existen en la lista de categorías
1106   //morfosintácticas ambiguas. Si existen en la lista de etiquetas ambiguas
1107   //las añade a la lista de salida, en caso contrario si no esta es posible
1108   //etiqueta ambigua y es la etiqueta más frecuente se devuelve una lista
1109   //sólo con la etiqueta más frecuente.
1110   if (Known_or_Unknown==KNOWN)
1111     {
1112       if (w != HASH_FAIL)
1113         {
1114           simpleList *list = (simpleList *) d->getElementMaybe(w);
1115           list->setFirst();
1116           while (ret >= 0 && stop == FALSE)
1117             {
1118               pInfoDict = (infoDict *) list->getIndex();
1119
1120               if (HASH_FAIL!=hash_lookup(learnerAMBP_H,pInfoDict->txt))
1121                 {
1122                   infoDict *ptr = new infoDict;
1123                   strcpy(ptr->txt,pInfoDict->txt);
1124                   ptr->num = pInfoDict->num;
1125                   lout->add(ptr);
1126                 }
1127                 else if ( strcmp(d->getMFT(w),pInfoDict->txt)==0 ) //si es most frequent tag
1128                 {
1129                   d->dictCleanListInfoDict(lout,lout->numElements());
1130                   lout->deleteList();
1131                   infoDict *ptr = new infoDict;
1132                   strcpy(ptr->txt,pInfoDict->txt);
1133                   ptr->num = pInfoDict->num;
1134                   lout->add(ptr);
1135                   stop = TRUE;
1136                 }
1137               ret=list->next();
1138             }
1139           list->setFirst();
1140         }
1141     }
1142   //Si la palabra es desconocida la lista de salida contiene todas las
1143   //categorías morfosintácticas posibles desconocidas.
1144   else  if (Known_or_Unknown==UNKNOWN)
1145     {
1146          learnerUNKP_L->setFirst();
1147          while (ret>=0)
1148          {
1149            pInfoDict =(infoDict *) learnerUNKP_L->getIndex();
1150            infoDict *ptr = new infoDict;
1151            strcpy(ptr->txt,pInfoDict->txt);
1152            ptr->num = pInfoDict->num;
1153            lout->add(ptr);
1154
1155            ret=learnerUNKP_L->next();
1156          }
1157          learnerUNKP_L->setFirst();
1158       }
1159
1160   return lout;
1161 }
1162
1163
1164 /**********************************************************/
1165 /*
1166  * Recibe como parámetros:
1167  * un apuntador al depósito de pesos (wr),
1168  * un apuntador a mapping (m),
1169  * un string con la etiqueta (pos) que estamos entrenando y
1170  * el nombre del fichero  (fileName) en el cual se almacenan las SVs
1171  * para la etiqueta indicada.
1172  *
1173  * Partiendo de estos datos recorre el fichero fileName,  traduciendo
1174  * cada uno de los atributos leídos gracias al mapping (m).
1175  * Y añadiendo la correspondiente pareja etiqueta/atributo
1176  * al depósito de pesos wr mediante el método wrAdd.
1177  *
1178  * Se devuelve el apuntador al depósito de pesos (weightRepository)
1179  * que ha sido modificado.
1180  */
1181 weightRepository *learner::learnerBuiltWeightRepository(weightRepository *wr,mapping *m,char *pos,char *fileName)
1182 {
1183   char str[200]="";
1184   char *key;
1185   FILE *f = openFile(fileName,"r");
1186   int ret=1,trobat=FALSE;
1187
1188   while (!feof(f) && ret>=0)
1189     {
1190       if (trobat==FALSE)
1191         {
1192           ret= readTo(f,'\n',0,str);
1193           if (strstr(str,"threshold")!=NULL) trobat=TRUE;
1194         }
1195       else
1196         {
1197           memset(str,0,strlen(str));
1198           ret = readTo(f,' ',0,str);
1199           long double ld;
1200
1201           ld  = atof(str);
1202
1203           while (ret>0)
1204             {
1205               ret = readTo(f,':','\n',str);
1206               if (ret>0)
1207                 {
1208                   key = m->mappingGetFeatureByNumber(str);
1209                   ret = readTo(f,' ','\n',str);
1210                   if ((uintptr_t)key != HASH_FAIL) wr->wrAdd(key,pos,ld);
1211                 }
1212             }
1213
1214         }
1215     }
1216   fclose (f);
1217   return wr;
1218 }
1219
1220 /**************************************************************/
1221
1222 void learner::learnerDestroyBias(hash_t *h)
1223 {
1224   weight_node_t *aux;
1225
1226   hash_t *tptr = h;
1227   hash_node_t **old_bucket, *old_hash, *tmp;
1228   int old_size;
1229
1230   old_bucket=tptr->bucket;
1231   old_size=tptr->size;
1232
1233   //Recorre cada lista de sinónimos del hash eliminando su contenido
1234   for (int i=0; i<old_size; i++)
1235     {
1236       old_hash=old_bucket[i];
1237       while(old_hash)
1238         {
1239           tmp=old_hash;
1240           old_hash=old_hash->next;
1241
1242           aux = (weight_node_t *) tmp->data;
1243
1244           delete aux;
1245           aux = NULL;
1246         } /* while */
1247     } /* for */
1248
1249   hash_destroy(h);
1250 }
1251
1252 /**************************************************************/
1253
1254 /*
1255  * Recibe como parámetros:
1256  * Un apuntador a hash_t (h) que es el lugar donde almacenaremos los sesgos,
1257  * la etiqueta (pos) que estamos entrenando y el fichero (fileName) de salida
1258  * de SVM-light en el cual se encuentran los datos esperados.
1259  * Este método lee el sesgo del fichero y lo añade (hash_insert)
1260  * al hashing h para la etiqueta pos indicada.
1261  * Posteriormente devuelve el apuntador al hashing modificado.
1262  */
1263 hash_t *learner::learnerBuiltBias(hash_t *h,char *pos,char *fileName)
1264 {
1265   char str[200]="";
1266   FILE *f = openFile(fileName ,"r");
1267   int ret=1,trobat=FALSE;
1268
1269   while (!feof(f) && trobat==FALSE && ret>=0)
1270     {
1271       ret= readTo(f,'\n',0,str);
1272       if (strstr(str,"threshold")!=NULL) trobat=TRUE;
1273     }
1274
1275   char bias[200];
1276   int sortir = FALSE;
1277   strcpy(bias,"");
1278   for (int i=0;(i<strlen(str) && sortir==FALSE) ;i++)
1279     {
1280       if (str[i]==' ' || str[i]=='#') sortir == TRUE;
1281       else sprintf(bias,"%s%c",bias,str[i]);
1282     }
1283
1284   weight_node_t *w = new weight_node_t;
1285   strcpy(w->pos,pos);
1286   w->data = (long double)0;
1287   w->data = atof (bias);
1288   hash_insert(h,w->pos,(uintptr_t)w);
1289
1290   fclose(f);
1291   return h;
1292 }
1293
1294 /********************************************************/
1295
1296 /*
1297  * Recibe como parámetro el nombre del fichero de configuración (train).
1298  * Ejecuta el aprendizaje
1299  */
1300 void learner::learnerRun(char *train)
1301 {
1302   int iWrd=0,iSent=0;
1303
1304   init_stack(&DO);
1305   //Leemos el fichero de configuración
1306   read_config_file(train);
1307   //obtener tamaño del corpus
1308   learnerCount(TRAINSET,&iWrd,&iSent);
1309
1310   if ( verbose == TRUE )
1311     {
1312       fprintf(stderr,"\n* trainset # words        = [ %d ]",iWrd);
1313       fprintf(stderr,"\n* trainset # sentences    = [ %d ]",iSent);
1314       fprintf(stderr,"\n* ========================================================================");
1315       fprintf(stderr,"\n\n* ========================================================================");
1316       fprintf(stderr,"\n* PREPARING TRAINING");
1317       fprintf(stderr,"\n* ========================================================================");
1318     }
1319   struct  tms tbuff1,tbuff2;
1320   clock_t start,end;
1321   start = times(&tbuff1);
1322
1323   //Creamos el diccionario
1324   char name[200];
1325   sprintf(name,"%s.DICT",NAME);
1326   dictionary *d = new dictionary(TRAINSET, 0,0);
1327   if (R!=NULL) d->dictRepairFromFile(R); // "/mnt/hda4/pfc/WSJ.200");
1328         else d->dictRepairHeuristic(DRATIO);
1329   d->dictWrite(name);
1330   delete d;
1331   d  = new dictionary(name);
1332
1333   //Obtenemos las listas de etiquetas
1334   if (learnerAMBP_H == NULL)
1335     learnerAMBP_H = d->dictFindAmbP(&learnerNumAMBP);
1336   if (learnerUNKP_H == NULL)
1337     learnerUNKP_H = d->dictFindUnkP(&learnerNumUNKP);
1338   //Creamos los ficheros de etiquetas
1339   learnerCreatePOSFile(NAME,TRUE,learnerAMBP_H);
1340   learnerCreatePOSFile(NAME,FALSE,learnerUNKP_H);
1341   //Creamos unos hashings con las listas de etiquetas
1342   learnerAMBP_L = learnerTransformHashInList(learnerAMBP_H);
1343   learnerUNKP_L = learnerTransformHashInList(learnerUNKP_H);
1344
1345   //creamos el fichero de configuración de la ventana
1346   learnerCreateDefaultFile(NAME,"WIN");
1347
1348   //Calculamos en numero de chunks
1349   int chunks = learnerNumChunks(TRAINSET,X,iSent);
1350   //Mientras haya modelos por entrenar
1351   while (!empty(&DO))
1352     {
1353       int *numModel = (int *) pop(&DO);
1354       int *direction = (int *) pop(&DO);
1355
1356         //Eliminamos los ficheros anteriores
1357       removeFiles(NAME, RM_MODEL_FILES,*numModel, *direction, verbose);
1358       removeFiles(NAME, RM_TEMP_FILES ,*numModel, *direction, verbose);
1359
1360       sprintf(name,"A%d",*numModel);
1361       learnerCreateDefaultFile(NAME,name);
1362
1363         //Creamos la ventana
1364       FILE *f = fopen(TRAINSET, "r");
1365       sw = new swindow(f,WINDOW_SIZE,CORE_POSITION);
1366       //ejecutamos el entrenamiento para el modelo
1367       learnerTrainModel(TRAINSET,d,*numModel,*direction,iSent,iWrd,chunks);
1368       delete sw;
1369     }
1370   delete d;
1371
1372   end = times(&tbuff2);
1373   if ( verbose == TRUE )
1374     {
1375       fprintf(stderr,"\n\n* ========================================================================\n");
1376       showTime ("* SVM-light Time",  time_svmlight, time_svmlight, 0);
1377       showTime ("* SVMTlearn Time",
1378                 ((double)(end-start))/CLOCKS_PER_SECOND - time_svmlight, //CLK_TCK,
1379                 ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND,
1380                 ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND);
1381       showTime ("* Total Learning Time",
1382                 time_svmlight + ((double)(end-start))/CLOCKS_PER_SECOND,
1383                 time_svmlight + ((double)tbuff2.tms_utime-(double)tbuff1.tms_utime)/CLOCKS_PER_SECOND,
1384                 ((double)tbuff2.tms_stime-(double)tbuff1.tms_stime)/CLOCKS_PER_SECOND);
1385       fprintf(stderr,"* ========================================================================\n\n");
1386     }
1387 }
1388
1389 /**************************************************************/
1390
1391 /*
1392  * Recorre el texto de izquierda a derecha seleccionando ejemplos
1393  */
1394 int learner::learnerLeftToRight(simpleList *featureList, simpleList *featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio)
1395 {
1396   int ret = 0;
1397
1398   if ( inicio == -1 ) while (sw->previous()==0);
1399   else if (sw->getIndex()->ord!=inicio)
1400     {
1401       while(ret == 0 && sw->getIndex()->ord != inicio)
1402       {
1403         if (inicio < sw->getIndex()->ord ) ret = sw->previous();
1404         if (inicio > sw->getIndex()->ord ) ret = sw->next();
1405       }
1406     }
1407   nodo *elem = sw->getIndex();
1408   numWrds--;
1409   learnerGenerateFeatures(elem,featureList,dKnown, LEFT_TO_RIGHT);
1410   learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT);
1411
1412   while(numWrds>=0)
1413     {
1414       if ( sw->next() != 0 ) return numWrds;
1415
1416       elem = sw->getIndex();
1417       numWrds--;
1418       learnerGenerateFeatures(elem,featureList,dKnown,LEFT_TO_RIGHT);
1419       learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, LEFT_TO_RIGHT);
1420     }
1421
1422    return (numWrds);
1423 }
1424
1425 /**************************************************************/
1426
1427 /*
1428  * Recorre el texto de derecha a izquierda seleccionando ejemplos
1429  */
1430 int learner::learnerRightToLeft(simpleList *featureList, simpleList *featureListUnk, dictionary *dKnown, dictionary *dUnknown, int numWrds, int inicio)
1431 {
1432   int ret = 0;
1433
1434   if ( inicio == -1 ) while (sw->next()==0);
1435   else  if ( sw->getIndex()->ord != inicio )
1436     {
1437      while(ret == 0 && sw->getIndex()->ord != inicio)
1438       {
1439         if (inicio < sw->getIndex()->ord ) ret = sw->previous();
1440         if (inicio > sw->getIndex()->ord ) ret = sw->next();
1441       }
1442     }
1443
1444   nodo *elem = sw->getIndex();
1445   numWrds--;
1446   learnerGenerateFeatures(elem,featureList,dKnown,RIGHT_TO_LEFT);
1447   learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown,dUnknown,RIGHT_TO_LEFT);
1448
1449   while( numWrds>=0 )
1450     {
1451       if ( sw->previous() != 0 ) return numWrds;
1452
1453       elem = sw->getIndex();
1454       numWrds--;
1455       learnerGenerateFeatures(elem,featureList,dKnown, RIGHT_TO_LEFT);
1456       learnerGenerateFeaturesUnk(elem,featureListUnk,dKnown, dUnknown, RIGHT_TO_LEFT);
1457     }
1458
1459   return numWrds;
1460 }
1461
1462 /**************************************************************/
1463 /*
1464  * Esta función recibe como parámetros:
1465  * el apuntador a un nodo de la ventana (elem),
1466  * una pila donde apilara los atributos generados (stk),
1467  * la lista de atributos que debe generar (featureList),
1468  * el diccionario con la información necesaria para el cálculo de features (d)
1469  * y  la dirección en que se recorre el corpus (direction).
1470  * Recorre la lista featureList y ejecuta los métodos necesarios
1471  * de la ventana (swindow) para generar los atributos y que al final
1472  * de la ejecución de este método esten apilados en stk.
1473 */
1474 void learner::learnerGetFeatures(nodo *elem, stack_t *stk,dictionary *d, simpleList *featureList, int direction)
1475 {
1476   nodo_feature_list *aux = NULL;
1477   int ret = 1;
1478   //Recorre la lista de atributos y crea los atributos correspondientes
1479   while (ret>=0)
1480     {
1481       aux = (nodo_feature_list *) featureList->getIndex();
1482       if (strcmp(aux->mark,SLASTW)==0)  sw->winPushSwnFeature(stk);
1483       else if (strcmp(aux->mark,WMARK)==0)  sw->winPushWordFeature((void *)aux,d,stk,direction);
1484       else if (strcmp(aux->mark,KMARK)==0)  sw->winPushAmbiguityFeature((void *)aux,d,stk,direction);
1485       else if (strcmp(aux->mark,MMARK)==0)  sw->winPushMaybeFeature((void *)aux,d,stk,direction);
1486       else if (strcmp(aux->mark,PMARK)==0)  sw->winPushPosFeature((void *)aux,d,stk,direction);
1487       else if (strcmp(aux->mark,MFTMARK)==0)  sw->winPushMFTFeature((void *)aux,d,stk,direction);
1488       else
1489         {
1490           int *param;
1491           if (aux->n>0)
1492           {
1493             param = (int *) aux->l.getIndex();
1494           }
1495           if (strcmp(aux->mark,PREFIX_MARK)==0)  sw->winPushPrefixFeature(elem->wrd, stk, *param);
1496           else if (strcmp(aux->mark,SUFFIX_MARK)==0) sw->winPushSuffixFeature(elem->wrd, stk, *param);
1497           else if (strcmp(aux->mark,CHAR_A_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, COUNTING_FROM_BEGIN);
1498           else if (strcmp(aux->mark,CHAR_Z_MARK)==0) sw->winPushLetterFeature(elem->wrd, stk, *param, COUNTING_FROM_END);
1499           else if (strcmp(aux->mark,LENGTH_MARK)==0) sw->winPushLenghtFeature(elem->wrd,stk);
1500           else if (strcmp(aux->mark,START_CAPITAL_MARK)==0) sw->winPushStartWithCapFeature(elem->wrd,stk);
1501           else if (strcmp(aux->mark,START_LOWER_MARK)==0)  sw->winPushStartWithLowerFeature(elem->wrd,stk);
1502           else if (strcmp(aux->mark,START_NUMBER_MARK)==0) sw->winPushStartWithNumberFeature(elem->wrd,stk);
1503           else if (strcmp(aux->mark,ALL_UPPER_MARK)==0) sw->winPushAllUpFeature(elem->wrd,stk);
1504           else if (strcmp(aux->mark,ALL_LOWER_MARK)==0) sw->winPushAllLowFeature(elem->wrd,stk);
1505           else if (strcmp(aux->mark,CONTAIN_CAP_MARK)==0) sw->winPushContainCapFeature(elem->wrd, stk);
1506           else if (strcmp(aux->mark,CONTAIN_CAPS_MARK)==0) sw->winPushContainCapsFeature(elem->wrd, stk);
1507           else if (strcmp(aux->mark,CONTAIN_COMMA_MARK)==0) sw->winPushContainCommaFeature(elem->wrd, stk);
1508           else if (strcmp(aux->mark,CONTAIN_NUMBER_MARK)==0) sw->winPushContainNumFeature(elem->wrd, stk);
1509           else if (strcmp(aux->mark,CONTAIN_PERIOD_MARK)==0) sw->winPushContainPeriodFeature(elem->wrd, stk);
1510           else if (strcmp(aux->mark,MULTIWORD_MARK)==0) sw->winPushMultiwordFeature(elem->wrd, stk);
1511         }
1512       ret = featureList->next();
1513     }
1514   featureList->setFirst();
1515 }
1516
1517 /**************************************************************/
1518
1519 /*
1520  * El objetivo de este método es seleccionar o descartar una palabra para
1521  * realizar en el entrenamiento de palabras desconocidas, calcular respectivos
1522  * atributos e insertar esta información en el fichero de ejemplos correspondiente.
1523  */
1524 void learner::learnerGenerateFeaturesUnk(nodo *elem, simpleList *featureList,dictionary *d, dictionary *dUnk, int direction)
1525 {
1526   stack_t stk;
1527   nodo_feature_list *aux;
1528   int is_selected = FALSE;
1529   char *feature = NULL;
1530   char buffer[1000];
1531
1532   strcpy(buffer,"");
1533
1534   if (d==NULL || elem==NULL || featureList==NULL) return;
1535
1536   init_stack(&stk);
1537
1538   sprintf(buffer,"%s:%s",elem->wrd,elem->comment);
1539
1540   int i = dUnk->getElement(elem->wrd);
1541   if (i!=HASH_FAIL)
1542     {
1543       /*
1544       int i2 = d->getElement(elem->wrd);
1545       if (dUnk->getElementNumMaybe(i) == 1 && hash_lookup(learnerUNKP_H,d->getMFT(i2))!=HASH_FAIL )
1546       {
1547             fprintf(fUnknown,buffer,strlen(buffer));
1548             is_selected = TRUE;
1549       }
1550     */
1551     }
1552   else
1553     {
1554           fprintf(fUnknown,buffer,strlen(buffer));
1555           is_selected = TRUE;
1556     }
1557
1558   if ( is_selected == TRUE) learnerGetFeatures(elem, &stk,dUnk, featureList, direction );
1559
1560   while (!empty(&stk) && is_selected == TRUE)
1561     {
1562       feature = (char *) pop(&stk);
1563       sprintf(buffer," %s",feature);
1564       fprintf(fUnknown,buffer,strlen(buffer)+1);
1565       delete feature;
1566     }
1567
1568   if (is_selected == TRUE) fprintf(fUnknown,"\n");
1569
1570   strcpy(elem->pos,elem->comment);
1571  }
1572
1573 /**************************************************************/
1574
1575 /*
1576  * El objetivo de este método es seleccionar o descartar una palabra para
1577  * realizar en el entrenamiento de palabras conocidas, calcular respectivos
1578  * atributos e insertar esta información en el fichero de ejemplos correspondiente.
1579  */
1580 void learner::learnerGenerateFeatures(nodo *elem, simpleList *featureList,dictionary *d, int direction)
1581 {
1582   stack_t stk;
1583   nodo_feature_list *aux;
1584   int is_selected = FALSE;
1585   int is_unk = FALSE;
1586   char *feature,buffer[1000]="";
1587
1588   if (d==NULL || elem==NULL || featureList==NULL) return;
1589
1590   init_stack(&stk);
1591
1592   sprintf(buffer,"%s:%s",elem->wrd,elem->comment);
1593
1594   int i = d->getElement(elem->wrd);
1595   if (i!=HASH_FAIL)
1596     {
1597
1598       if ( d->getElementNumMaybe(i)>1 && hash_lookup(learnerAMBP_H,d->getMFT(i))!=HASH_FAIL )
1599         {
1600           fprintf(fKnown,buffer,strlen(buffer));
1601           is_selected = TRUE;
1602         }
1603
1604     }
1605
1606   if ( is_selected == TRUE) learnerGetFeatures(elem, &stk,d, featureList, direction );
1607
1608   while (!empty(&stk) && is_selected == TRUE)
1609     {
1610       feature = (char *) pop(&stk);
1611       sprintf(buffer," %s",feature);
1612       /*  if (i!=HASH_FAIL)
1613           {*/
1614           fprintf(fKnown,buffer,strlen(buffer)+1);
1615
1616       delete feature;
1617     }
1618
1619   sprintf(buffer,"\n");
1620
1621   if ( is_selected == TRUE)     fprintf(fKnown,buffer,strlen(buffer));
1622
1623   strcpy(elem->pos,elem->comment);
1624 }
1625
1626 /************************************************************/
1627 /*
1628  * Ejecuta SVM-light. Recibe como parámetros cuatro cadenas de
1629  * caracteres: svmdir es el directorio en el que se encuentra SVM-light,
1630  * options son las opciones con que se lanzará SVM-light, posFile el nombre
1631  * del fichero de ejemplos usado como entrada para la herramienta
1632  * de Joachims y, por último, outFile que es el nombre del fichero
1633  * de salida.  Esta función devuelve 0.
1634 */
1635 int learner::learnerExecSVMlight(char *svmdir, char *options, char *posFile, char *outFile)
1636 {
1637   time_t begin, finish;
1638
1639   begin = time (0);
1640
1641   char command[500]="";
1642   float c;
1643   strcpy(command,"");
1644   sprintf(command,"%s/svm_learn -v 0  %s %s %s",svmdir,options,posFile,outFile);
1645
1646   if ( verbose == TRUE ) fprintf(stderr,"Executing Joachims svm_light [ with options: %s ]  ",options);
1647   system(command);
1648   if ( verbose == TRUE ) fprintf(stderr," [DONE]");
1649
1650   finish = time(0);
1651
1652   time_svmlight = difftime(finish,begin) +  time_svmlight;
1653
1654   return 0;
1655 }
1656
1657 /**************************************************************/
1658
1659 simpleList *learner::learnerTransformHashInList(hash_t *tptr)
1660 {
1661   hash_node_t *node, *last;
1662   int i;
1663   simpleList *l = new simpleList;
1664
1665   for (i=0; i<tptr->size; i++)
1666     {
1667       node = tptr->bucket[i];
1668       while (node != NULL)
1669         {
1670           last = node;
1671           node = node->next;
1672           infoDict *p = (infoDict *) last->data;
1673           l->add( (void *)last->data);
1674         }
1675     }
1676   l->setFirst();
1677   return l;
1678 }
1679
1680 /**************************************************************/
1681
1682 /*
1683  * Calcularemos el número de fragmentos en el que se ha de dividir
1684  * el corpus para conseguir un porcentaje de palabras desconocidas
1685  * determinado. Lo parámetros de entrada son el nombre del fichero
1686  * de entrenamiento (trainingFileName), el porcentaje de palabras
1687  * desconocidas deseado (percentage) y el número de frases del corpus
1688  * (nSentences). El valor devuelto es un entero indicando el número de
1689  * fragmentos. Si el número de fragmentos calculado es mayor que el número
1690  * de frases que contiene el corpus de entrenamiento se devolverá como el
1691  * número de frases (nSentences).
1692  */
1693 int learner::learnerNumChunks(char *trainingFileName,float percentage,int nSentences)
1694 {
1695   int ret=0,ndwords=0,nwords=0;
1696   char wrd[500];
1697
1698   FILE *f = openFile (trainingFileName,"r");
1699   hash_t h1;
1700   hash_init(&h1,10000);
1701   while (!feof(f))
1702     {
1703       ret = readTo(f,' ','\n',wrd);
1704       if (ret>=0)
1705         {
1706           nwords++;
1707           char *w = new char[strlen(wrd)+1];
1708           strcpy(w,wrd);
1709           if ((uintptr_t)hash_insert(&h1,w,(uintptr_t) w)==HASH_FAIL) ndwords++;
1710           if (ret>0) readTo(f,'\n','\n',wrd);
1711         }
1712     }
1713
1714    //Read again until a certain point where X is met   --> $ndwords * (100 - $X) / 100;
1715    float meeting = ndwords * (100 - X ) /100;
1716    int nwords2=0,ndwords2=0;
1717    fseek(f,0,SEEK_SET);
1718    hash_t h2;
1719    hash_init(&h2,10000);
1720    while (!feof(f) && ndwords2<meeting)
1721     {
1722       ret = readTo(f,' ','\n',wrd);
1723       if (ret>=0)
1724         {
1725           nwords2++;
1726           char *w = new char[strlen(wrd)+1];
1727           strcpy(w,wrd);
1728           if (hash_insert(&h2,w,(uintptr_t) w)==HASH_FAIL) ndwords2++;
1729           if (ret>0) readTo(f,'\n','\n',wrd);
1730         }
1731     }
1732
1733    int chunks = nwords/(nwords - nwords2);
1734    if (nSentences<=chunks) chunks = nSentences;
1735
1736    fclose (f);
1737
1738    hash_destroy(&h2);
1739    hash_destroy(&h1);
1740    return chunks;
1741 }
1742
1743 /**************************************************************/
1744
1745 int learner::learnerIsInsideList(simpleList *l, char *key)
1746 {
1747   if (l==NULL || key==NULL || strcmp(key,"")==0) return FALSE;
1748   int ret = 0;
1749   while (ret>=0)
1750     {
1751       infoDict *ptr = (infoDict *)l->getIndex();
1752       if (strcmp(key,ptr->txt)==0)
1753        {
1754          return TRUE;
1755        }
1756     }
1757   l->setFirst();
1758   return FALSE;
1759 }