src/dict.cc

   1 /*
   2  * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  */
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22 #include "hash.h"
  23 #include "list.h"
  24 #include "dict.h"
  25 #include "swindow.h"
  26 #include "common.h"
  27 #include "er.h"
  28
  29
  30 extern int verbose_svmtool;
  31
  32 /**************************************************/
  33
  34 char *dictionary::getMFT(int w)
  35 {
  36         if (w==HASH_FAIL) return NULL;
  37
  38         int max=0,ret = 1;
  39         char *mft = new char[TAM_POS];
  40         simpleList *l = this->getElementMaybe(w);
  41         infoDict *ptr;
  42
  43         l->setFirst();
  44         while (ret>=0)
  45         {
  46                 ptr = (infoDict *)l->getIndex();
  47                 if (max<ptr->num)
  48                 { strcpy(mft,ptr->txt);
  49                         max = ptr->num;
  50                 }
  51                 ret = l->next();
  52         }
  53         l->setFirst();
  54 return mft;
  55 }
  56
  57 /**************************************************/
  58
  59 char *dictionary::getAmbiguityClass(int w)
  60 {
  61         char *amb = new char[200];
  62         if (w==HASH_FAIL)
  63         {
  64          sprintf(amb,"UNKNOWN");
  65          return amb;
  66         }
  67
  68         int ret = 1;
  69         strcpy(amb,"");
  70         simpleList *l = this->getElementMaybe(w);
  71         int numMaybe = this->getElementNumMaybe(w);
  72         infoDict *ptr;
  73
  74         l->setFirst();
  75         while (ret>=0)
  76         {
  77           numMaybe--;
  78           ptr = (infoDict *)l->getIndex();
  79           // fprintf(stderr," %s %d",ptr->txt,ptr->num);
  80           if (numMaybe>0) sprintf(amb,"%s%s_",amb,ptr->txt);
  81           else sprintf(amb,"%s%s",amb,ptr->txt);
  82           ret = l->next();
  83         }
  84         l->setFirst();
  85         return amb;
  86 }
  87
  88 /**************************************************/
  89
  90 void dictionary::dictIncInfo(dataDict *elem, char *pos)
  91 {
  92   int ret=1;
  93   infoDict *pInfoDict;
  94
  95   elem->numWrd++;
  96   while (ret>=0)
  97     {
  98       pInfoDict = (infoDict *) elem->maybe.getIndex();
  99       if (strcmp(pInfoDict->txt,pos)==0)
 100         {
 101           pInfoDict->num++;
 102           elem->maybe.setFirst();
 103           return;
 104         }
 105       ret=elem->maybe.next();
 106     }
 107   pInfoDict = new infoDict;
 108   strcpy(pInfoDict->txt,pos);
 109   pInfoDict->num=1;
 110   elem->maybe.add(pInfoDict);
 111   elem->numMaybe++;
 112   elem->maybe.setFirst();
 113 }
 114
 115 /**************************************************/
 116
 117 void dictionary::dictWrite(char *outName)
 118 {
 119   int ret=0;
 120   infoDict *data;
 121   dataDict *aux;
 122   int cont=0,contWords=0;
 123   char stringPOS[1000];
 124
 125   FILE *f = openFile(outName,"w");
 126
 127   hash_t *tptr = &d;
 128
 129   hash_node_t **old_bucket, *old_hash, *tmp;
 130   int old_size, h, i;
 131
 132   old_bucket=tptr->bucket;
 133   old_size=tptr->size;
 134
 135
 136   for (i=0; i<old_size; i++)
 137     {
 138       old_hash=old_bucket[i];
 139       while(old_hash)
 140         {
 141           tmp=old_hash;
 142           old_hash=old_hash->next;
 143
 144           aux = (dataDict *) tmp->data;
 145           //fprintf(stderr,"\n%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe);
 146           fprintf(f,"%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe);
 147
 148           cont++;
 149           contWords = aux->numWrd+contWords;
 150           ret  = 1;
 151           strcpy(stringPOS,"");
 152           while (aux->numMaybe>0 && ret>=0)
 153             {
 154               data = (infoDict *) aux->maybe.getIndex();
 155               //fprintf(stderr," [ %s %d ]",data->txt,data->num);
 156
 157               if (strlen(stringPOS)==0) sprintf(stringPOS,"%s %d",data->txt,data->num);
 158               else sprintf(stringPOS,"%s %s %d",stringPOS,data->txt,data->num);
 159
 160               ret=aux->maybe.next();
 161             }
 162
 163           char *szOut = new char[strlen(stringPOS)+1];
 164           ordenarStringPorParejas(stringPOS, szOut, 0, stringPOS);
 165           fprintf(f," %s\n",szOut);
 166
 167           delete szOut;
 168         } /* while */
 169     } /* for */
 170
 171   fclose(f);
 172   return;
 173 }
 174
 175 /**************************************************/
 176
 177 void dictionary::dictCreate(FILE *f,int limitInf,int limitSup)
 178 {
 179   int retW=0,retP=0,contWords=0,cont=0,contWordsAdded=0;
 180   infoDict *data;
 181   dataDict *aux,*aux2;
 182   nodo *elem;
 183   char wrd[200],pos[10];
 184   int no_chunk = FALSE;
 185
 186   if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nCreating Dictionary");
 187
 188   if (limitInf == 0 && limitSup == 0) no_chunk = TRUE;
 189
 190   while (retP>=0 && retW>=0)
 191     {
 192       if ( verbose_svmtool  == TRUE) showProcessDone(contWordsAdded, 1000, FALSE,"");
 193       retW = readString(f, wrd);
 194       char *real  = new char [strlen(wrd)+1];
 195       strcpy(real,wrd);
 196       retP = readString(f, pos);
 197
 198       if (retW>=0 && retP>=0)
 199         {
 200           int erRet=erLookRegExp(wrd);
 201           switch (erRet)
 202            {
 203                 case CARD: strcpy(wrd,"@CARD"); break;
 204                 case CARDSEPS: strcpy(wrd,"@CARDSEPS"); break;
 205                 case CARDPUNCT: strcpy(wrd,"@CARDPUNCT"); break;
 206                 case CARDSUFFIX: strcpy(wrd,"@CARDSUFFIX"); break;
 207            }
 208
 209           int is_valid_for_limit_inf =  ( (contWords < limitInf)  || contWords == 0 );
 210           int is_valid_for_limit_sup =  (contWords >= limitSup );
 211
 212           if ( no_chunk == TRUE || is_valid_for_limit_inf || is_valid_for_limit_sup )
 213             {
 214               if ((uintptr_t)(aux=(dataDict *)hash_lookup(&d,wrd)) == HASH_FAIL)
 215                 {
 216                   aux= new dataDict;
 217                   strcpy(aux->wrd,wrd);
 218                   aux->numMaybe = 1;
 219                   aux->numWrd = 1;
 220                   data = new infoDict;
 221                   strcpy(data->txt,pos);
 222                   data->num=1;
 223                   aux->maybe.add(data);
 224                   hash_insert(&d,aux->wrd,(uintptr_t) aux);
 225                   cont++;
 226                 }
 227               else dictIncInfo(aux,pos);
 228               contWordsAdded++;
 229
 230               if (strcmp(wrd,"@CARD")==0 || strcmp(wrd,"@CARDPUNCT")==0
 231                   || strcmp(wrd,"@CARDSEPS")==0 ||  strcmp(wrd,"@CARDSUFFIX")==0)
 232                 {
 233                   if ((uintptr_t)(aux2=(dataDict *)hash_lookup(&d,real)) == HASH_FAIL)
 234                   {
 235                     aux2 = new dataDict;
 236                     strcpy(aux2->wrd,real);
 237                     aux2->numMaybe = 1;
 238                     aux2->numWrd = 1;
 239                     data = new infoDict;
 240                     strcpy(data->txt,pos);
 241                     data->num = 1;
 242                     aux2->maybe.add(data);
 243                     hash_insert(&d,aux2->wrd,(uintptr_t) aux2);
 244                     cont++;
 245                   }
 246                   else dictIncInfo(aux2,pos);
 247                 }
 248             }
 249         }
 250         contWords++;
 251         delete real;
 252     }
 253
 254    if ( verbose_svmtool == TRUE ) fprintf(stderr,"[ %d words ]",cont);
 255 }
 256
 257 /**************************************************/
 258
 259 void dictionary::dictRepairFromFile(char *fileName)
 260 {
 261   if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nReparing Dictionary with file < %s >",fileName);
 262   FILE *f = openFile(fileName,"r");
 263
 264   char wrd[250],pos[10];
 265   int numWrd,numMaybe,numWrdxPOS;
 266   dataDict *aux;
 267
 268
 269   // Bucle para leer lista de palabras
 270   while (!feof(f))
 271     {
 272       fscanf(f,"%s %d %d",wrd,&numWrd,&numMaybe);
 273       int w = hash_lookup(&d,wrd);
 274       if (w!=HASH_FAIL)
 275       {
 276          aux = new dataDict;
 277          strcpy(aux->wrd,wrd);
 278          aux->numWrd = getElementNumWord(w);
 279          aux->numMaybe = 0;
 280
 281          simpleList *l = getElementMaybe(w);
 282
 283          for (int i=0;i<numMaybe;i++)
 284          {
 285                 fscanf(f,"%s %d",pos,&numWrdxPOS);
 286                 int ret=0;
 287
 288                 while (ret>=0)
 289                 {
 290                   infoDict *ptr = (infoDict *)l->getIndex();
 291                   if (strcmp(pos,ptr->txt)==0)
 292                      {
 293                        //Copiamos  elemento a añadir
 294                        infoDict *tmpInfoDict = new infoDict;
 295                        strcpy(tmpInfoDict->txt,ptr->txt);
 296                        tmpInfoDict->num = ptr->num;
 297
 298                        aux->maybe.add(tmpInfoDict);
 299                        aux->numMaybe++;
 300                        ret = -1;
 301                        delete ptr; //Borrar substituido
 302                       }
 303                   else ret = l->next();
 304
 305                 }
 306             l->setFirst();
 307          }
 308
 309         delete (dataDict *) hash_delete (&d,wrd);
 310         hash_insert(&d,aux->wrd,(uintptr_t) aux);
 311       }
 312    }
 313    fclose(f);
 314 }
 315
 316 /**************************************************/
 317
 318 void dictionary::dictRepairHeuristic(float dratio)
 319 {
 320   hash_t *tptr = &d;
 321   hash_node_t *node, *last;
 322   int i;
 323
 324   for (i=0; i<tptr->size; i++)
 325         {
 326                 node = tptr->bucket[i];
 327                 while (node != NULL)
 328                 {
 329                    last = node;
 330                    node = node->next;
 331
 332                    int ret=0;
 333                    dataDict *dd = (dataDict *)last->data;
 334
 335                    simpleList *l = &dd->maybe;
 336                    l->setFirst();
 337                    int iNumWrdsAfterDelete = dd->numWrd;
 338                    while (ret>=0)
 339                     {
 340                       infoDict *ptr = (infoDict *)l->getIndex();
 341
 342                       float fRange = (float) ptr->num / (float) dd->numWrd ;
 343
 344                         if (fRange < dratio)
 345                         {
 346                           dd->numMaybe--;
 347                           iNumWrdsAfterDelete =  iNumWrdsAfterDelete - ptr->num;
 348                           l->delIndex(); //Eliminar pos
 349                         }
 350                       ret = l->next();
 351                     }
 352                   dd->numWrd =  iNumWrdsAfterDelete;
 353                   l->setFirst();
 354                 }
 355         }
 356 }
 357
 358 /**************************************************/
 359
 360 int dictionary::readInt(FILE *in)
 361 {       int i=0;
 362  char value[10];
 363  char c=' ';
 364
 365  strcpy(value,"");
 366
 367  while ((c==' ') && (!feof(in))) c=fgetc(in);
 368  while ((i<10) && (c!=' ') && (c!='\n') && (!feof(in)))
 369    {
 370      sprintf(value,"%s%c",value,c);
 371      c=fgetc(in); i++;
 372    }
 373  return atoi(value);
 374 }
 375
 376 /**************************************************/
 377
 378 infoDict *dictionary::readData(FILE *in)
 379 {
 380   infoDict *data = new infoDict;
 381   char c=fgetc(in);
 382   int i = 0;
 383
 384   strcpy(data->txt,"");
 385
 386   while ( (i<TAMTXT) && (c!=' ' && c!='\n' && c!='\t') && (!feof(in)) )
 387     {
 388       sprintf(data->txt,"%s%c",data->txt,c); c=fgetc(in); i++;
 389     }
 390   data->num  = readInt(in);
 391   return data;
 392 }
 393
 394 /**************************************************/
 395
 396 void dictionary::dictAddBackup(char *name)
 397 {
 398   FILE *f = openFile(name,"r");
 399
 400   char wrd[250],pos[10];
 401   int ret,i;
 402   dataDict *aux;
 403   infoDict *data;
 404
 405   // Bucle para leer lista de palabras
 406   while (!feof(f))
 407     {
 408       data = readData(f);
 409       i = readInt(f);
 410       uintptr_t w = hash_lookup(&d,wrd);
 411       if (w==HASH_FAIL)
 412       {
 413          aux = new dataDict;
 414          strcpy(aux->wrd,data->txt);
 415          aux->numWrd = 0;
 416          aux->numMaybe = 0;
 417       }
 418       else  aux = (dataDict *) w;
 419
 420       aux->numWrd += data->num;
 421       delete data;
 422       while (i>0)
 423       {
 424           data = readData(f);
 425           ret=1;
 426           //Buscamos si ja existe en la lista.
 427           for (int j=aux->numMaybe;ret>=0 &&  j>0; j--)
 428             {
 429               infoDict *element = (infoDict *)aux->maybe.getIndex();
 430               if (strcmp(data->txt,element->txt)==0)
 431                 {
 432                   ret = -1;
 433                   element->num += data->num;
 434                 }
 435               else ret = aux->maybe.next();
 436             }
 437           //Si no encontrado lo añadimos a la lista
 438           if (ret!=-1)
 439             {
 440               aux->maybe.add(data);
 441               aux->numMaybe++;
 442             }
 443           else delete data;
 444           i--;
 445         }
 446        if (w==HASH_FAIL) hash_insert(&d,aux->wrd,(uintptr_t) aux);
 447
 448     } //End while not eof
 449    fclose(f);
 450 }
 451
 452 /**************************************************/
 453
 454 void dictionary::dictLoad(FILE *in)
 455 {
 456   char c='\0';
 457   char wrd[25]="";
 458   int i=0,number;
 459   dataDict *aux;
 460   infoDict *data;
 461
 462   while (!feof(in))
 463     {
 464       data = readData(in);
 465       i = readInt(in);
 466       aux = new dataDict;
 467
 468       strcpy(aux->wrd,data->txt);
 469
 470       aux->numWrd = data->num;
 471       aux->numMaybe = i;
 472       delete data;
 473
 474       //Si leemos una línea con numMaybe = 0, es decir, sin
 475       //ninguna etiqueta no la cargaremos.
 476       if ( aux->numMaybe > 0)
 477         {
 478           while (i>0)
 479             {
 480               data = readData(in);
 481               aux->maybe.add(data);
 482
 483               i--;
 484             }
 485           hash_insert(&d,aux->wrd,(uintptr_t) aux);
 486         }
 487       else delete aux;
 488     }
 489 }
 490
 491 /**************************************************/
 492
 493 int dictionary::getElement(char *key)
 494 {
 495   return hash_lookup(&d,key);
 496 }
 497
 498 /**************************************************/
 499
 500 char *dictionary::getElementWord(uintptr_t ptr)
 501 {
 502   dataDict *aux = (dataDict *) ptr;
 503   return aux->wrd;
 504 }
 505
 506 /**************************************************/
 507
 508 int dictionary::getElementNumWord(uintptr_t ptr)
 509 {
 510   dataDict *aux = (dataDict *) ptr;
 511   return aux->numWrd;
 512 }
 513
 514 /**************************************************/
 515
 516 int dictionary::getElementNumMaybe(uintptr_t ptr)
 517 {
 518   dataDict *aux = (dataDict *) ptr;
 519   return aux->numMaybe;
 520 }
 521
 522 /**************************************************/
 523
 524 simpleList *dictionary::getElementMaybe(uintptr_t ptr)
 525 {
 526   dataDict *aux = (dataDict *) ptr;
 527   return &aux->maybe;
 528 }
 529
 530 /**************************************************/
 531
 532 dictionary::dictionary(char *name,char *backup)
 533 {
 534   FILE *in = openFile(name,"r");
 535   hash_init(&d,1000);
 536   dictLoad(in);
 537   fclose(in);
 538   dictAddBackup(backup);
 539 }
 540
 541 /**************************************************/
 542
 543 dictionary::dictionary(char *name)
 544 {
 545   FILE *in = openFile(name,"r");
 546   hash_init(&d,1000);
 547   dictLoad(in);
 548   fclose(in);
 549 }
 550
 551 /**************************************************/
 552
 553 dictionary::dictionary(char *name,int limInf, int limSup)
 554 {
 555   FILE *in = openFile(name,"r");
 556   char str[200];
 557   hash_init(&d,1000);
 558   dictCreate(in,limInf,limSup);
 559   fclose(in);
 560 }
 561
 562 /**************************************************/
 563
 564 void dictionary::dictCleanListInfoDict(simpleList *l, int num)
 565 {
 566   infoDict *data = NULL;
 567
 568           l->setFirst();
 569           if ( num >= 1 ) //Si tiene mas de un maybe es ambigua
 570           {
 571                 int ret  = 1;
 572                 while (ret>=0)
 573                 {
 574                   data = (infoDict *) l->getIndex();
 575                   delete data;
 576                   ret=l->next();
 577                 }
 578                 l->setFirst();
 579           } /* if */
 580 }
 581
 582 /**************************************************/
 583
 584 dictionary::~dictionary()
 585 {
 586   infoDict *data;
 587   dataDict *aux;
 588
 589   //Recorrer cada entrada del diccionario eliminando el contenido de las listas
 590   hash_t *tptr = &d;
 591
 592   hash_node_t **old_bucket, *old_hash, *tmp;
 593   int old_size, h, i;
 594
 595   old_bucket=tptr->bucket;
 596   old_size=tptr->size;
 597
 598   for (i=0; i<old_size; i++)
 599     {
 600       old_hash=old_bucket[i];
 601       while(old_hash)
 602         {
 603           tmp=old_hash;
 604           old_hash=old_hash->next;
 605
 606           aux = (dataDict *) tmp->data;
 607
 608           dictCleanListInfoDict(&aux->maybe,aux->numMaybe);
 609           delete aux;
 610           aux = NULL;
 611         } /* while */
 612     } /* for */
 613
 614   //Destruir hashing
 615   hash_destroy(&d);
 616 }
 617
 618 /**************************************************/
 619
 620 hash_t *dictionary::dictFindAmbP(int *numPOS)
 621 {
 622   int ret=0;
 623   infoDict *data;
 624   dataDict *aux;
 625
 626   hash_t *ambp = new hash_t;
 627   hash_t *tptr = &d;
 628   hash_init(ambp,30);
 629
 630   hash_node_t **old_bucket, *old_hash, *tmp;
 631   int old_size, h, i;
 632
 633   old_bucket=tptr->bucket;
 634   old_size=tptr->size;
 635
 636   *numPOS = 0;
 637
 638   for (i=0; i<old_size; i++)
 639     {
 640       old_hash=old_bucket[i];
 641       while(old_hash)
 642         {
 643           tmp=old_hash;
 644           old_hash=old_hash->next;
 645
 646           aux = (dataDict *) tmp->data;
 647           aux->maybe.setFirst();
 648           if (aux->numMaybe>1) //Si tiene mas de un maybe es ambigua
 649           {
 650                 ret  = 1;
 651                 while (ret>=0)
 652                 {
 653                 data = (infoDict *) aux->maybe.getIndex();
 654                 infoDict * tmp = new infoDict;
 655                 strcpy(tmp->txt,data->txt);
 656                 tmp->num = data->num;
 657                 hash_insert(ambp,tmp->txt,(uintptr_t) tmp);
 658
 659                 *numPOS++;
 660                 ret=aux->maybe.next();
 661                 }
 662                 aux->maybe.setFirst();
 663           } /* if */
 664         } /* while */
 665     } /* for */
 666   return ambp;
 667 }
 668
 669 /**************************************************/
 670
 671 hash_t *dictionary::dictFindUnkP(int *numPOS)
 672 {
 673   int ret=0;
 674   infoDict *data;
 675   dataDict *aux;
 676
 677   hash_t *unkp = new hash_t;
 678   hash_t *tptr = &d;
 679   hash_init(unkp,30);
 680
 681   hash_node_t **old_bucket, *old_hash, *tmp;
 682   int old_size, h, i;
 683
 684   old_bucket=tptr->bucket;
 685   old_size=tptr->size;
 686
 687         *numPOS = 0;
 688
 689   for (i=0; i<old_size; i++)
 690     {
 691       old_hash=old_bucket[i];
 692       while(old_hash)
 693         {
 694           tmp=old_hash;
 695           old_hash=old_hash->next;
 696
 697           aux = (dataDict *) tmp->data;
 698           aux->maybe.setFirst();
 699           if (aux->numWrd==1) //Si solo aparece una vez desconocida
 700           {
 701                 ret  = 1;
 702                 while (ret>=0)
 703                 {
 704                 data = (infoDict *) aux->maybe.getIndex();
 705                 infoDict * tmp = new infoDict;
 706                 strcpy(tmp->txt,data->txt);
 707                 tmp->num = data->num;
 708                 hash_insert(unkp,tmp->txt,(uintptr_t) tmp);
 709                 *numPOS++;
 710                 ret=aux->maybe.next();
 711                 }
 712                 aux->maybe.setFirst();
 713           } /* if */
 714         } /* while */
 715     } /* for */
 716   return unkp;
 717 }
 718