Fixed Makefile to compile program using shared library
[svmtool++.git] / src / dict.cc
bloba1ee728cd3e051186e15209ac6e643857e15fec3
1 /*
2 * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "hash.h"
23 #include "list.h"
24 #include "dict.h"
25 #include "swindow.h"
26 #include "common.h"
27 #include "er.h"
30 extern int verbose_svmtool;
32 /**************************************************/
34 char *dictionary::getMFT(int w)
36 if (w==HASH_FAIL) return NULL;
38 int max=0,ret = 1;
39 char *mft = new char[TAM_POS];
40 simpleList *l = this->getElementMaybe(w);
41 infoDict *ptr;
43 l->setFirst();
44 while (ret>=0)
46 ptr = (infoDict *)l->getIndex();
47 if (max<ptr->num)
48 { strcpy(mft,ptr->txt);
49 max = ptr->num;
51 ret = l->next();
53 l->setFirst();
54 return mft;
57 /**************************************************/
59 char *dictionary::getAmbiguityClass(int w)
61 char *amb = new char[200];
62 if (w==HASH_FAIL)
64 sprintf(amb,"UNKNOWN");
65 return amb;
68 int ret = 1;
69 strcpy(amb,"");
70 simpleList *l = this->getElementMaybe(w);
71 int numMaybe = this->getElementNumMaybe(w);
72 infoDict *ptr;
74 l->setFirst();
75 while (ret>=0)
77 numMaybe--;
78 ptr = (infoDict *)l->getIndex();
79 // fprintf(stderr," %s %d",ptr->txt,ptr->num);
80 if (numMaybe>0) sprintf(amb,"%s%s_",amb,ptr->txt);
81 else sprintf(amb,"%s%s",amb,ptr->txt);
82 ret = l->next();
84 l->setFirst();
85 return amb;
88 /**************************************************/
90 void dictionary::dictIncInfo(dataDict *elem, char *pos)
92 int ret=1;
93 infoDict *pInfoDict;
95 elem->numWrd++;
96 while (ret>=0)
98 pInfoDict = (infoDict *) elem->maybe.getIndex();
99 if (strcmp(pInfoDict->txt,pos)==0)
101 pInfoDict->num++;
102 elem->maybe.setFirst();
103 return;
105 ret=elem->maybe.next();
107 pInfoDict = new infoDict;
108 strcpy(pInfoDict->txt,pos);
109 pInfoDict->num=1;
110 elem->maybe.add(pInfoDict);
111 elem->numMaybe++;
112 elem->maybe.setFirst();
115 /**************************************************/
117 void dictionary::dictWrite(char *outName)
119 int ret=0;
120 infoDict *data;
121 dataDict *aux;
122 int cont=0,contWords=0;
123 char stringPOS[1000];
125 FILE *f = openFile(outName,"w");
127 hash_t *tptr = &d;
129 hash_node_t **old_bucket, *old_hash, *tmp;
130 int old_size, h, i;
132 old_bucket=tptr->bucket;
133 old_size=tptr->size;
136 for (i=0; i<old_size; i++)
138 old_hash=old_bucket[i];
139 while(old_hash)
141 tmp=old_hash;
142 old_hash=old_hash->next;
144 aux = (dataDict *) tmp->data;
145 //fprintf(stderr,"\n%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe);
146 fprintf(f,"%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe);
148 cont++;
149 contWords = aux->numWrd+contWords;
150 ret = 1;
151 strcpy(stringPOS,"");
152 while (aux->numMaybe>0 && ret>=0)
154 data = (infoDict *) aux->maybe.getIndex();
155 //fprintf(stderr," [ %s %d ]",data->txt,data->num);
157 if (strlen(stringPOS)==0) sprintf(stringPOS,"%s %d",data->txt,data->num);
158 else sprintf(stringPOS,"%s %s %d",stringPOS,data->txt,data->num);
160 ret=aux->maybe.next();
163 char *szOut = new char[strlen(stringPOS)+1];
164 ordenarStringPorParejas(stringPOS, szOut, 0, stringPOS);
165 fprintf(f," %s\n",szOut);
167 delete szOut;
168 } /* while */
169 } /* for */
171 fclose(f);
172 return;
175 /**************************************************/
177 void dictionary::dictCreate(FILE *f,int limitInf,int limitSup)
179 int retW=0,retP=0,contWords=0,cont=0,contWordsAdded=0;
180 infoDict *data;
181 dataDict *aux,*aux2;
182 nodo *elem;
183 char wrd[200],pos[10];
184 int no_chunk = FALSE;
186 if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nCreating Dictionary");
188 if (limitInf == 0 && limitSup == 0) no_chunk = TRUE;
190 while (retP>=0 && retW>=0)
192 if ( verbose_svmtool == TRUE) showProcessDone(contWordsAdded, 1000, FALSE,"");
193 retW = readString(f, wrd);
194 char *real = new char [strlen(wrd)+1];
195 strcpy(real,wrd);
196 retP = readString(f, pos);
198 if (retW>=0 && retP>=0)
200 int erRet=erLookRegExp(wrd);
201 switch (erRet)
203 case CARD: strcpy(wrd,"@CARD"); break;
204 case CARDSEPS: strcpy(wrd,"@CARDSEPS"); break;
205 case CARDPUNCT: strcpy(wrd,"@CARDPUNCT"); break;
206 case CARDSUFFIX: strcpy(wrd,"@CARDSUFFIX"); break;
209 int is_valid_for_limit_inf = ( (contWords < limitInf) || contWords == 0 );
210 int is_valid_for_limit_sup = (contWords >= limitSup );
212 if ( no_chunk == TRUE || is_valid_for_limit_inf || is_valid_for_limit_sup )
214 if ((uintptr_t)(aux=(dataDict *)hash_lookup(&d,wrd)) == HASH_FAIL)
216 aux= new dataDict;
217 strcpy(aux->wrd,wrd);
218 aux->numMaybe = 1;
219 aux->numWrd = 1;
220 data = new infoDict;
221 strcpy(data->txt,pos);
222 data->num=1;
223 aux->maybe.add(data);
224 hash_insert(&d,aux->wrd,(uintptr_t) aux);
225 cont++;
227 else dictIncInfo(aux,pos);
228 contWordsAdded++;
230 if (strcmp(wrd,"@CARD")==0 || strcmp(wrd,"@CARDPUNCT")==0
231 || strcmp(wrd,"@CARDSEPS")==0 || strcmp(wrd,"@CARDSUFFIX")==0)
233 if ((uintptr_t)(aux2=(dataDict *)hash_lookup(&d,real)) == HASH_FAIL)
235 aux2 = new dataDict;
236 strcpy(aux2->wrd,real);
237 aux2->numMaybe = 1;
238 aux2->numWrd = 1;
239 data = new infoDict;
240 strcpy(data->txt,pos);
241 data->num = 1;
242 aux2->maybe.add(data);
243 hash_insert(&d,aux2->wrd,(uintptr_t) aux2);
244 cont++;
246 else dictIncInfo(aux2,pos);
250 contWords++;
251 delete real;
254 if ( verbose_svmtool == TRUE ) fprintf(stderr,"[ %d words ]",cont);
257 /**************************************************/
259 void dictionary::dictRepairFromFile(char *fileName)
261 if ( verbose_svmtool == TRUE ) fprintf(stderr,"\nReparing Dictionary with file < %s >",fileName);
262 FILE *f = openFile(fileName,"r");
264 char wrd[250],pos[10];
265 int numWrd,numMaybe,numWrdxPOS;
266 dataDict *aux;
269 // Bucle para leer lista de palabras
270 while (!feof(f))
272 fscanf(f,"%s %d %d",wrd,&numWrd,&numMaybe);
273 int w = hash_lookup(&d,wrd);
274 if (w!=HASH_FAIL)
276 aux = new dataDict;
277 strcpy(aux->wrd,wrd);
278 aux->numWrd = getElementNumWord(w);
279 aux->numMaybe = 0;
281 simpleList *l = getElementMaybe(w);
283 for (int i=0;i<numMaybe;i++)
285 fscanf(f,"%s %d",pos,&numWrdxPOS);
286 int ret=0;
288 while (ret>=0)
290 infoDict *ptr = (infoDict *)l->getIndex();
291 if (strcmp(pos,ptr->txt)==0)
293 //Copiamos elemento a añadir
294 infoDict *tmpInfoDict = new infoDict;
295 strcpy(tmpInfoDict->txt,ptr->txt);
296 tmpInfoDict->num = ptr->num;
298 aux->maybe.add(tmpInfoDict);
299 aux->numMaybe++;
300 ret = -1;
301 delete ptr; //Borrar substituido
303 else ret = l->next();
306 l->setFirst();
309 delete (dataDict *) hash_delete (&d,wrd);
310 hash_insert(&d,aux->wrd,(uintptr_t) aux);
313 fclose(f);
316 /**************************************************/
318 void dictionary::dictRepairHeuristic(float dratio)
320 hash_t *tptr = &d;
321 hash_node_t *node, *last;
322 int i;
324 for (i=0; i<tptr->size; i++)
326 node = tptr->bucket[i];
327 while (node != NULL)
329 last = node;
330 node = node->next;
332 int ret=0;
333 dataDict *dd = (dataDict *)last->data;
335 simpleList *l = &dd->maybe;
336 l->setFirst();
337 int iNumWrdsAfterDelete = dd->numWrd;
338 while (ret>=0)
340 infoDict *ptr = (infoDict *)l->getIndex();
342 float fRange = (float) ptr->num / (float) dd->numWrd ;
344 if (fRange < dratio)
346 dd->numMaybe--;
347 iNumWrdsAfterDelete = iNumWrdsAfterDelete - ptr->num;
348 l->delIndex(); //Eliminar pos
350 ret = l->next();
352 dd->numWrd = iNumWrdsAfterDelete;
353 l->setFirst();
358 /**************************************************/
360 int dictionary::readInt(FILE *in)
361 { int i=0;
362 char value[10];
363 char c=' ';
365 strcpy(value,"");
367 while ((c==' ') && (!feof(in))) c=fgetc(in);
368 while ((i<10) && (c!=' ') && (c!='\n') && (!feof(in)))
370 sprintf(value,"%s%c",value,c);
371 c=fgetc(in); i++;
373 return atoi(value);
376 /**************************************************/
378 infoDict *dictionary::readData(FILE *in)
380 infoDict *data = new infoDict;
381 char c=fgetc(in);
382 int i = 0;
384 strcpy(data->txt,"");
386 while ( (i<TAMTXT) && (c!=' ' && c!='\n' && c!='\t') && (!feof(in)) )
388 sprintf(data->txt,"%s%c",data->txt,c); c=fgetc(in); i++;
390 data->num = readInt(in);
391 return data;
394 /**************************************************/
396 void dictionary::dictAddBackup(char *name)
398 FILE *f = openFile(name,"r");
400 char wrd[250],pos[10];
401 int ret,i;
402 dataDict *aux;
403 infoDict *data;
405 // Bucle para leer lista de palabras
406 while (!feof(f))
408 data = readData(f);
409 i = readInt(f);
410 uintptr_t w = hash_lookup(&d,wrd);
411 if (w==HASH_FAIL)
413 aux = new dataDict;
414 strcpy(aux->wrd,data->txt);
415 aux->numWrd = 0;
416 aux->numMaybe = 0;
418 else aux = (dataDict *) w;
420 aux->numWrd += data->num;
421 delete data;
422 while (i>0)
424 data = readData(f);
425 ret=1;
426 //Buscamos si ja existe en la lista.
427 for (int j=aux->numMaybe;ret>=0 && j>0; j--)
429 infoDict *element = (infoDict *)aux->maybe.getIndex();
430 if (strcmp(data->txt,element->txt)==0)
432 ret = -1;
433 element->num += data->num;
435 else ret = aux->maybe.next();
437 //Si no encontrado lo añadimos a la lista
438 if (ret!=-1)
440 aux->maybe.add(data);
441 aux->numMaybe++;
443 else delete data;
444 i--;
446 if (w==HASH_FAIL) hash_insert(&d,aux->wrd,(uintptr_t) aux);
448 } //End while not eof
449 fclose(f);
452 /**************************************************/
454 void dictionary::dictLoad(FILE *in)
456 char c='\0';
457 char wrd[25]="";
458 int i=0,number;
459 dataDict *aux;
460 infoDict *data;
462 while (!feof(in))
464 data = readData(in);
465 i = readInt(in);
466 aux = new dataDict;
468 strcpy(aux->wrd,data->txt);
470 aux->numWrd = data->num;
471 aux->numMaybe = i;
472 delete data;
474 //Si leemos una lĂ­nea con numMaybe = 0, es decir, sin
475 //ninguna etiqueta no la cargaremos.
476 if ( aux->numMaybe > 0)
478 while (i>0)
480 data = readData(in);
481 aux->maybe.add(data);
483 i--;
485 hash_insert(&d,aux->wrd,(uintptr_t) aux);
487 else delete aux;
491 /**************************************************/
493 int dictionary::getElement(char *key)
495 return hash_lookup(&d,key);
498 /**************************************************/
500 char *dictionary::getElementWord(uintptr_t ptr)
502 dataDict *aux = (dataDict *) ptr;
503 return aux->wrd;
506 /**************************************************/
508 int dictionary::getElementNumWord(uintptr_t ptr)
510 dataDict *aux = (dataDict *) ptr;
511 return aux->numWrd;
514 /**************************************************/
516 int dictionary::getElementNumMaybe(uintptr_t ptr)
518 dataDict *aux = (dataDict *) ptr;
519 return aux->numMaybe;
522 /**************************************************/
524 simpleList *dictionary::getElementMaybe(uintptr_t ptr)
526 dataDict *aux = (dataDict *) ptr;
527 return &aux->maybe;
530 /**************************************************/
532 dictionary::dictionary(char *name,char *backup)
534 FILE *in = openFile(name,"r");
535 hash_init(&d,1000);
536 dictLoad(in);
537 fclose(in);
538 dictAddBackup(backup);
541 /**************************************************/
543 dictionary::dictionary(char *name)
545 FILE *in = openFile(name,"r");
546 hash_init(&d,1000);
547 dictLoad(in);
548 fclose(in);
551 /**************************************************/
553 dictionary::dictionary(char *name,int limInf, int limSup)
555 FILE *in = openFile(name,"r");
556 char str[200];
557 hash_init(&d,1000);
558 dictCreate(in,limInf,limSup);
559 fclose(in);
562 /**************************************************/
564 void dictionary::dictCleanListInfoDict(simpleList *l, int num)
566 infoDict *data = NULL;
568 l->setFirst();
569 if ( num >= 1 ) //Si tiene mas de un maybe es ambigua
571 int ret = 1;
572 while (ret>=0)
574 data = (infoDict *) l->getIndex();
575 delete data;
576 ret=l->next();
578 l->setFirst();
579 } /* if */
582 /**************************************************/
584 dictionary::~dictionary()
586 infoDict *data;
587 dataDict *aux;
589 //Recorrer cada entrada del diccionario eliminando el contenido de las listas
590 hash_t *tptr = &d;
592 hash_node_t **old_bucket, *old_hash, *tmp;
593 int old_size, h, i;
595 old_bucket=tptr->bucket;
596 old_size=tptr->size;
598 for (i=0; i<old_size; i++)
600 old_hash=old_bucket[i];
601 while(old_hash)
603 tmp=old_hash;
604 old_hash=old_hash->next;
606 aux = (dataDict *) tmp->data;
608 dictCleanListInfoDict(&aux->maybe,aux->numMaybe);
609 delete aux;
610 aux = NULL;
611 } /* while */
612 } /* for */
614 //Destruir hashing
615 hash_destroy(&d);
618 /**************************************************/
620 hash_t *dictionary::dictFindAmbP(int *numPOS)
622 int ret=0;
623 infoDict *data;
624 dataDict *aux;
626 hash_t *ambp = new hash_t;
627 hash_t *tptr = &d;
628 hash_init(ambp,30);
630 hash_node_t **old_bucket, *old_hash, *tmp;
631 int old_size, h, i;
633 old_bucket=tptr->bucket;
634 old_size=tptr->size;
636 *numPOS = 0;
638 for (i=0; i<old_size; i++)
640 old_hash=old_bucket[i];
641 while(old_hash)
643 tmp=old_hash;
644 old_hash=old_hash->next;
646 aux = (dataDict *) tmp->data;
647 aux->maybe.setFirst();
648 if (aux->numMaybe>1) //Si tiene mas de un maybe es ambigua
650 ret = 1;
651 while (ret>=0)
653 data = (infoDict *) aux->maybe.getIndex();
654 infoDict * tmp = new infoDict;
655 strcpy(tmp->txt,data->txt);
656 tmp->num = data->num;
657 hash_insert(ambp,tmp->txt,(uintptr_t) tmp);
659 *numPOS++;
660 ret=aux->maybe.next();
662 aux->maybe.setFirst();
663 } /* if */
664 } /* while */
665 } /* for */
666 return ambp;
669 /**************************************************/
671 hash_t *dictionary::dictFindUnkP(int *numPOS)
673 int ret=0;
674 infoDict *data;
675 dataDict *aux;
677 hash_t *unkp = new hash_t;
678 hash_t *tptr = &d;
679 hash_init(unkp,30);
681 hash_node_t **old_bucket, *old_hash, *tmp;
682 int old_size, h, i;
684 old_bucket=tptr->bucket;
685 old_size=tptr->size;
687 *numPOS = 0;
689 for (i=0; i<old_size; i++)
691 old_hash=old_bucket[i];
692 while(old_hash)
694 tmp=old_hash;
695 old_hash=old_hash->next;
697 aux = (dataDict *) tmp->data;
698 aux->maybe.setFirst();
699 if (aux->numWrd==1) //Si solo aparece una vez desconocida
701 ret = 1;
702 while (ret>=0)
704 data = (infoDict *) aux->maybe.getIndex();
705 infoDict * tmp = new infoDict;
706 strcpy(tmp->txt,data->txt);
707 tmp->num = data->num;
708 hash_insert(unkp,tmp->txt,(uintptr_t) tmp);
709 *numPOS++;
710 ret=aux->maybe.next();
712 aux->maybe.setFirst();
713 } /* if */
714 } /* while */
715 } /* for */
716 return unkp;