2 * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Verbosity flag, declared extern here. Functions in this file compare it
// against TRUE before printing progress messages to stderr.
30 extern int verbose_svmtool
;
32 /**************************************************/
// getMFT: copies a tag text (infoDict::txt) from the "maybe" list of the
// dictionary entry identified by handle w into a newly allocated buffer of
// TAM_POS chars. Returns NULL when w equals HASH_FAIL.
// NOTE(review): the name suggests "most frequent tag", but the selection
// logic, the loop over the list and the final return are not visible in this
// excerpt -- confirm against the full file.
// NOTE(review): w is an int, while getElementMaybe() in this file takes a
// uintptr_t that it casts to dataDict*; a pointer-sized handle would be
// truncated where pointers are wider than int.
34 char *dictionary::getMFT(int w
)
36 if (w
==HASH_FAIL
) return NULL
;
// Result buffer allocated with new[]; presumably the caller must delete[] it
// if it is what gets returned -- TODO confirm.
39 char *mft
= new char[TAM_POS
];
40 simpleList
*l
= this->getElementMaybe(w
);
// Element at the list's current position, viewed as an infoDict.
46 ptr
= (infoDict
*)l
->getIndex();
// strcpy does not check the length of ptr->txt against the TAM_POS buffer.
48 { strcpy(mft
,ptr
->txt
);
57 /**************************************************/
// getAmbiguityClass: builds, in a newly allocated 200-byte buffer, a string
// made of the tag texts stored in the "maybe" list of the entry identified by
// handle w. Each tag is appended to the buffer, followed by "_" while
// numMaybe is still greater than zero.
// NOTE(review): the condition under which "UNKNOWN" is written, the loop that
// walks the list / updates numMaybe, and the return are not visible in this
// excerpt -- confirm against the full file.
// NOTE(review): w is an int but is passed to helpers that take uintptr_t and
// cast it to dataDict*.
59 char *dictionary::getAmbiguityClass(int w
)
// Fixed-size result buffer; the sprintf calls below do not bound their output.
61 char *amb
= new char[200];
64 sprintf(amb
,"UNKNOWN");
70 simpleList
*l
= this->getElementMaybe(w
);
71 int numMaybe
= this->getElementNumMaybe(w
);
78 ptr
= (infoDict
*)l
->getIndex();
79 // fprintf(stderr," %s %d",ptr->txt,ptr->num);
// NOTE(review): amb is both the destination and a "%s" source argument in the
// two sprintf calls below; the C standard leaves copying between overlapping
// objects in sprintf undefined. Appending at amb + strlen(amb) would avoid it.
80 if (numMaybe
>0) sprintf(amb
,"%s%s_",amb
,ptr
->txt
);
81 else sprintf(amb
,"%s%s",amb
,ptr
->txt
);
88 /**************************************************/
// dictIncInfo: updates the tag list (elem->maybe) of an existing dictionary
// entry for the tag text pos. The visible code compares pos against the txt
// of the list's current infoDict, advances with next(), and also allocates a
// new infoDict holding pos and adds it to the list.
// NOTE(review): the counter updates implied by the name, and the conditions
// selecting between the "found" and "add new" paths, are not visible in this
// excerpt -- confirm against the full file.
90 void dictionary::dictIncInfo(dataDict
*elem
, char *pos
)
98 pInfoDict
= (infoDict
*) elem
->maybe
.getIndex();
// Tag already present in the list?
99 if (strcmp(pInfoDict
->txt
,pos
)==0)
// Move the list cursor back to the first element.
102 elem
->maybe
.setFirst();
105 ret
=elem
->maybe
.next();
// New list element for a tag; strcpy does not bound-check pos against the
// size of infoDict::txt.
107 pInfoDict
= new infoDict
;
108 strcpy(pInfoDict
->txt
,pos
);
110 elem
->maybe
.add(pInfoDict
);
112 elem
->maybe
.setFirst();
115 /**************************************************/
// dictWrite: writes the dictionary to the file outName (opened with
// openFile(outName,"w")). It walks the buckets of the hash table tptr and, for
// every entry, prints "<wrd> <numWrd> <numMaybe>" followed by a space, the
// string produced by ordenarStringPorParejas() from the entry's
// "<tag> <count>" pairs, and a newline.
// NOTE(review): ordenarStringPorParejas ("sort string by pairs") is defined
// elsewhere; presumably it writes the pairs in sorted order into szOut --
// confirm. Closing f and releasing szOut are not visible in this excerpt.
117 void dictionary::dictWrite(char *outName
)
122 int cont
=0,contWords
=0;
// Accumulates the "<tag> <count>" pairs of one entry; fixed 1000 bytes and the
// sprintf calls below do not bound their output.
123 char stringPOS
[1000];
125 FILE *f
= openFile(outName
,"w");
129 hash_node_t
**old_bucket
, *old_hash
, *tmp
;
132 old_bucket
=tptr
->bucket
;
// One iteration per hash bucket.
136 for (i
=0; i
<old_size
; i
++)
138 old_hash
=old_bucket
[i
];
// Advance along the bucket's chain of nodes.
142 old_hash
=old_hash
->next
;
144 aux
= (dataDict
*) tmp
->data
;
145 //fprintf(stderr,"\n%s %d %d",aux->wrd,aux->numWrd,aux->numMaybe);
146 fprintf(f
,"%s %d %d",aux
->wrd
,aux
->numWrd
,aux
->numMaybe
);
// Running total of word occurrences over all entries.
149 contWords
= aux
->numWrd
+contWords
;
151 strcpy(stringPOS
,"");
// Collect every tag of this entry while next() keeps returning >= 0.
152 while (aux
->numMaybe
>0 && ret
>=0)
154 data
= (infoDict
*) aux
->maybe
.getIndex();
155 //fprintf(stderr," [ %s %d ]",data->txt,data->num);
157 if (strlen(stringPOS
)==0) sprintf(stringPOS
,"%s %d",data
->txt
,data
->num
);
// NOTE(review): stringPOS is both destination and "%s" source here; the C
// standard leaves sprintf between overlapping objects undefined.
158 else sprintf(stringPOS
,"%s %s %d",stringPOS
,data
->txt
,data
->num
);
160 ret
=aux
->maybe
.next();
// Output buffer sized to the unsorted string plus terminator.
163 char *szOut
= new char[strlen(stringPOS
)+1];
164 ordenarStringPorParejas(stringPOS
, szOut
, 0, stringPOS
);
165 fprintf(f
," %s\n",szOut
);
175 /**************************************************/
// dictCreate: builds the dictionary hash table d from the stream f, reading
// alternating word and POS strings with readString() until either read
// returns a negative value.
//  - Words for which erLookRegExp() returns CARD, CARDSEPS, CARDPUNCT or
//    CARDSUFFIX are replaced by the placeholder "@CARD", "@CARDSEPS",
//    "@CARDPUNCT" or "@CARDSUFFIX".
//  - A pair is used when limitInf and limitSup are both 0, or when contWords
//    is below limitInf (or is 0), or when contWords is at least limitSup.
//  - A word not yet in d gets a new entry; otherwise dictIncInfo() is called.
//  - When the word was replaced by a placeholder, the text held in `real` is
//    registered in the same way.
// NOTE(review): the statements that fill `real`, allocate aux/aux2/data and
// update the counters are not visible in this excerpt -- confirm.
177 void dictionary::dictCreate(FILE *f
,int limitInf
,int limitSup
)
179 int retW
=0,retP
=0,contWords
=0,cont
=0,contWordsAdded
=0;
// Fixed-size read buffers; whether readString() bounds its writes cannot be
// told from here -- TODO confirm.
183 char wrd
[200],pos
[10];
184 int no_chunk
= FALSE
;
186 if ( verbose_svmtool
== TRUE
) fprintf(stderr
,"\nCreating Dictionary");
// Both limits zero means no range restriction: every pair is used.
188 if (limitInf
== 0 && limitSup
== 0) no_chunk
= TRUE
;
190 while (retP
>=0 && retW
>=0)
192 if ( verbose_svmtool
== TRUE
) showProcessDone(contWordsAdded
, 1000, FALSE
,"");
193 retW
= readString(f
, wrd
);
// NOTE(review): allocated on every iteration; no matching delete[] is visible
// in this excerpt.
194 char *real
= new char [strlen(wrd
)+1];
196 retP
= readString(f
, pos
);
198 if (retW
>=0 && retP
>=0)
200 int erRet
=erLookRegExp(wrd
);
// Replace the word by a placeholder according to the expression it matched.
203 case CARD
: strcpy(wrd
,"@CARD"); break;
204 case CARDSEPS
: strcpy(wrd
,"@CARDSEPS"); break;
205 case CARDPUNCT
: strcpy(wrd
,"@CARDPUNCT"); break;
206 case CARDSUFFIX
: strcpy(wrd
,"@CARDSUFFIX"); break;
209 int is_valid_for_limit_inf
= ( (contWords
< limitInf
) || contWords
== 0 );
210 int is_valid_for_limit_sup
= (contWords
>= limitSup
);
212 if ( no_chunk
== TRUE
|| is_valid_for_limit_inf
|| is_valid_for_limit_sup
)
// Word not in the table yet: fill in a new entry and insert it.
214 if ((uintptr_t)(aux
=(dataDict
*)hash_lookup(&d
,wrd
)) == HASH_FAIL
)
217 strcpy(aux
->wrd
,wrd
);
221 strcpy(data
->txt
,pos
);
223 aux
->maybe
.add(data
);
224 hash_insert(&d
,aux
->wrd
,(uintptr_t) aux
);
// Word already known: update its tag information.
227 else dictIncInfo(aux
,pos
);
// For placeholder words, also register the text held in `real`.
230 if (strcmp(wrd
,"@CARD")==0 || strcmp(wrd
,"@CARDPUNCT")==0
231 || strcmp(wrd
,"@CARDSEPS")==0 || strcmp(wrd
,"@CARDSUFFIX")==0)
233 if ((uintptr_t)(aux2
=(dataDict
*)hash_lookup(&d
,real
)) == HASH_FAIL
)
236 strcpy(aux2
->wrd
,real
);
240 strcpy(data
->txt
,pos
);
242 aux2
->maybe
.add(data
);
243 hash_insert(&d
,aux2
->wrd
,(uintptr_t) aux2
);
246 else dictIncInfo(aux2
,pos
);
254 if ( verbose_svmtool
== TRUE
) fprintf(stderr
,"[ %d words ]",cont
);
257 /**************************************************/
// dictRepairFromFile: rewrites dictionary entries according to the file
// fileName (opened with openFile(fileName,"r")). For each record
// "<wrd> <numWrd> <numMaybe>" followed by numMaybe "<pos> <count>" pairs, it
// looks the word up in d and builds a replacement entry aux: the word text is
// copied, numWrd is taken from the existing entry, and for every pos read
// that matches a tag in the existing list a copy of that tag is added to
// aux->maybe. The old entry is then removed with hash_delete() and aux is
// inserted.
// NOTE(review): the loop conditions, the handling of words not found in d and
// the allocation of aux are not visible in this excerpt -- confirm.
259 void dictionary::dictRepairFromFile(char *fileName
)
261 if ( verbose_svmtool
== TRUE
) fprintf(stderr
,"\nReparing Dictionary with file < %s >",fileName
);
262 FILE *f
= openFile(fileName
,"r");
264 char wrd
[250],pos
[10];
265 int numWrd
,numMaybe
,numWrdxPOS
;
269 // Loop that reads the word list
// NOTE(review): "%s" has no field width, so a token longer than the 250-byte
// wrd buffer would overflow it; the fscanf return value is not checked here.
272 fscanf(f
,"%s %d %d",wrd
,&numWrd
,&numMaybe
);
// NOTE(review): the lookup result is stored in an int, while dictAddBackup()
// in this file stores it in a uintptr_t; a pointer-sized handle would be
// truncated where pointers are wider than int.
273 int w
= hash_lookup(&d
,wrd
);
277 strcpy(aux
->wrd
,wrd
);
// The word count comes from the existing entry, not from the file.
278 aux
->numWrd
= getElementNumWord(w
);
281 simpleList
*l
= getElementMaybe(w
);
283 for (int i
=0;i
<numMaybe
;i
++)
// NOTE(review): unbounded "%s" into the 10-byte pos buffer.
285 fscanf(f
,"%s %d",pos
,&numWrdxPOS
);
290 infoDict
*ptr
= (infoDict
*)l
->getIndex();
291 if (strcmp(pos
,ptr
->txt
)==0)
293 // Copy the element to be added
294 infoDict
*tmpInfoDict
= new infoDict
;
295 strcpy(tmpInfoDict
->txt
,ptr
->txt
);
296 tmpInfoDict
->num
= ptr
->num
;
298 aux
->maybe
.add(tmpInfoDict
);
301 delete ptr
; // Delete the replaced element
303 else ret
= l
->next();
// Swap the old entry for the rebuilt one.
309 delete (dataDict
*) hash_delete (&d
,wrd
);
310 hash_insert(&d
,aux
->wrd
,(uintptr_t) aux
);
316 /**************************************************/
// dictRepairHeuristic: walks every bucket of the hash table tptr and, for each
// entry, examines the tags in its "maybe" list. For a tag it computes
// fRange = tag count / entry word count; the visible code removes a tag from
// the list with delIndex(), subtracts its count from a running total, and
// finally stores that total in dd->numWrd.
// NOTE(review): the test that decides which tags are removed (presumably a
// comparison of fRange with dratio) is not visible in this excerpt -- confirm.
318 void dictionary::dictRepairHeuristic(float dratio
)
321 hash_node_t
*node
, *last
;
324 for (i
=0; i
<tptr
->size
; i
++)
326 node
= tptr
->bucket
[i
];
333 dataDict
*dd
= (dataDict
*)last
->data
;
335 simpleList
*l
= &dd
->maybe
;
// Starts at the entry's current word count; reduced as tags are removed.
337 int iNumWrdsAfterDelete
= dd
->numWrd
;
340 infoDict
*ptr
= (infoDict
*)l
->getIndex();
// Share of this tag among all occurrences of the word.
342 float fRange
= (float) ptr
->num
/ (float) dd
->numWrd
;
347 iNumWrdsAfterDelete
= iNumWrdsAfterDelete
- ptr
->num
;
348 l
->delIndex(); // Remove this POS entry
352 dd
->numWrd
= iNumWrdsAfterDelete
;
358 /**************************************************/
// readInt: reads one token from the stream in. It first skips space
// characters, then appends characters to the buffer value until a space, a
// newline, end of file, or until the counter i reaches 10.
// NOTE(review): the declarations, the reads inside the second loop, the
// conversion of value to an int and the return are not visible in this
// excerpt -- confirm against the full file.
360 int dictionary::readInt(FILE *in
)
// Skip leading spaces.
367 while ((c
==' ') && (!feof(in
))) c
=fgetc(in
);
368 while ((i
<10) && (c
!=' ') && (c
!='\n') && (!feof(in
)))
// NOTE(review): value is both destination and "%s" source; the C standard
// leaves sprintf between overlapping objects undefined.
370 sprintf(value
,"%s%c",value
,c
);
376 /**************************************************/
// readData: allocates a new infoDict, reads a text token from the stream in
// into data->txt (stopping at a space, newline, tab, end of file, or after
// TAMTXT characters) and then reads the number that follows with readInt()
// into data->num.
// NOTE(review): the initial read of c and the return statement are not
// visible in this excerpt; presumably the new infoDict is returned and owned
// by the caller -- confirm.
378 infoDict
*dictionary::readData(FILE *in
)
380 infoDict
*data
= new infoDict
;
384 strcpy(data
->txt
,"");
386 while ( (i
<TAMTXT
) && (c
!=' ' && c
!='\n' && c
!='\t') && (!feof(in
)) )
// NOTE(review): data->txt is both destination and "%s" source; the C standard
// leaves sprintf between overlapping objects undefined.
388 sprintf(data
->txt
,"%s%c",data
->txt
,c
); c
=fgetc(in
); i
++;
390 data
->num
= readInt(in
);
394 /**************************************************/
// dictAddBackup: merges the contents of the file name (opened with
// openFile(name,"r")) into the dictionary hash table d. For each word it looks
// the word up in d; a found handle is reused as the entry, and an entry whose
// lookup returned HASH_FAIL is inserted at the end. data->num is added to the
// entry's numWrd. For each tag, the entry's "maybe" list is searched for the
// same txt: on a match the counts are summed, otherwise data is added to the
// list.
// NOTE(review): the statements that read wrd/data from the file and that
// allocate a new entry are not visible in this excerpt -- confirm.
396 void dictionary::dictAddBackup(char *name
)
398 FILE *f
= openFile(name
,"r");
400 char wrd
[250],pos
[10];
405 // Loop that reads the word list
410 uintptr_t w
= hash_lookup(&d
,wrd
);
414 strcpy(aux
->wrd
,data
->txt
);
// Word already in the table: the handle is the entry pointer.
418 else aux
= (dataDict
*) w
;
420 aux
->numWrd
+= data
->num
;
426 // Check whether it already exists in the list.
427 for (int j
=aux
->numMaybe
;ret
>=0 && j
>0; j
--)
429 infoDict
*element
= (infoDict
*)aux
->maybe
.getIndex();
430 if (strcmp(data
->txt
,element
->txt
)==0)
433 element
->num
+= data
->num
;
435 else ret
= aux
->maybe
.next();
437 // If it was not found, add it to the list
440 aux
->maybe
.add(data
);
// Only entries that were not found by the lookup need inserting.
446 if (w
==HASH_FAIL
) hash_insert(&d
,aux
->wrd
,(uintptr_t) aux
);
448 } //End while not eof
452 /**************************************************/
// dictLoad: loads dictionary entries from the stream in into the hash table
// d. For an entry, the word text and count are taken from data (txt and num);
// when the entry's numMaybe is greater than zero, tag records are added to
// aux->maybe and the entry is inserted into d keyed by its word.
// NOTE(review): the read loop, the allocation of aux/data and the reading of
// numMaybe are not visible in this excerpt -- confirm against the full file.
454 void dictionary::dictLoad(FILE *in
)
468 strcpy(aux
->wrd
,data
->txt
);
470 aux
->numWrd
= data
->num
;
474 //If we read a line with numMaybe = 0, that is, without
475 //any tag, we do not load it.
476 if ( aux
->numMaybe
> 0)
481 aux
->maybe
.add(data
);
485 hash_insert(&d
,aux
->wrd
,(uintptr_t) aux
);
491 /**************************************************/
493 int dictionary::getElement(char *key
)
495 return hash_lookup(&d
,key
);
498 /**************************************************/
// getElementWord: treats the handle ptr as a pointer to a dataDict.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably it returns the entry's word text (wrd) -- confirm.
500 char *dictionary::getElementWord(uintptr_t ptr
)
502 dataDict
*aux
= (dataDict
*) ptr
;
506 /**************************************************/
// getElementNumWord: treats the handle ptr as a pointer to a dataDict.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably it returns the entry's numWrd field -- confirm.
508 int dictionary::getElementNumWord(uintptr_t ptr
)
510 dataDict
*aux
= (dataDict
*) ptr
;
514 /**************************************************/
516 int dictionary::getElementNumMaybe(uintptr_t ptr
)
518 dataDict
*aux
= (dataDict
*) ptr
;
519 return aux
->numMaybe
;
522 /**************************************************/
// getElementMaybe: treats the handle ptr as a pointer to a dataDict.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably it returns the address of the entry's maybe list -- confirm.
524 simpleList
*dictionary::getElementMaybe(uintptr_t ptr
)
526 dataDict
*aux
= (dataDict
*) ptr
;
530 /**************************************************/
// Constructor: opens the file name for reading with openFile() and merges the
// file backup into the dictionary through dictAddBackup().
// NOTE(review): the statements between the two calls (presumably table
// initialisation and loading from `in`) and the closing of `in` are not
// visible in this excerpt -- confirm.
532 dictionary::dictionary(char *name
,char *backup
)
534 FILE *in
= openFile(name
,"r");
538 dictAddBackup(backup
);
541 /**************************************************/
// Constructor: opens the file name for reading with openFile().
// NOTE(review): what is done with `in` afterwards (presumably loading the
// dictionary and closing the file) is not visible in this excerpt -- confirm.
543 dictionary::dictionary(char *name
)
545 FILE *in
= openFile(name
,"r");
551 /**************************************************/
// Constructor: opens the file name for reading with openFile() and builds the
// dictionary from it with dictCreate(in, limInf, limSup).
// NOTE(review): table initialisation and the closing of `in` are not visible
// in this excerpt -- confirm.
553 dictionary::dictionary(char *name
,int limInf
, int limSup
)
555 FILE *in
= openFile(name
,"r");
558 dictCreate(in
,limInf
,limSup
);
562 /**************************************************/
// dictCleanListInfoDict: processes the infoDict elements of the list l when
// num is at least 1, fetching each element with getIndex().
// NOTE(review): the loop and what is done with each element (presumably
// deleting it, given the name and the use from the destructor) are not
// visible in this excerpt -- confirm.
564 void dictionary::dictCleanListInfoDict(simpleList
*l
, int num
)
566 infoDict
*data
= NULL
;
// Original note: "if it has more than one maybe it is ambiguous".
// NOTE(review): that wording does not match the condition below, which is
// true for any num >= 1.
569 if ( num
>= 1 ) // see the note above this statement
574 data
= (infoDict
*) l
->getIndex();
582 /**************************************************/
// Destructor: walks every bucket of the hash table tptr and calls
// dictCleanListInfoDict() on the "maybe" list of each entry.
// NOTE(review): releasing the entries themselves and the table is not visible
// in this excerpt -- confirm against the full file.
584 dictionary::~dictionary()
589 // Walk every dictionary entry, deleting the contents of its lists
592 hash_node_t
**old_bucket
, *old_hash
, *tmp
;
595 old_bucket
=tptr
->bucket
;
// One iteration per hash bucket.
598 for (i
=0; i
<old_size
; i
++)
600 old_hash
=old_bucket
[i
];
// Advance along the bucket's chain of nodes.
604 old_hash
=old_hash
->next
;
606 aux
= (dataDict
*) tmp
->data
;
608 dictCleanListInfoDict(&aux
->maybe
,aux
->numMaybe
);
618 /**************************************************/
// dictFindAmbP: allocates a new hash table (ambp) and walks every bucket of
// the hash table tptr. For each entry with more than one tag (numMaybe > 1),
// every tag in its "maybe" list is copied into a new infoDict and inserted
// into ambp keyed by the tag text. The list cursor is reset with setFirst()
// before and after each entry.
// NOTE(review): initialisation of ambp, the use of numPOS and the return are
// not visible in this excerpt; presumably ambp is returned and numPOS receives
// the number of distinct tags -- confirm.
620 hash_t
*dictionary::dictFindAmbP(int *numPOS
)
626 hash_t
*ambp
= new hash_t
;
630 hash_node_t
**old_bucket
, *old_hash
, *tmp
;
633 old_bucket
=tptr
->bucket
;
// One iteration per hash bucket.
638 for (i
=0; i
<old_size
; i
++)
640 old_hash
=old_bucket
[i
];
// Advance along the bucket's chain of nodes.
644 old_hash
=old_hash
->next
;
646 aux
= (dataDict
*) tmp
->data
;
647 aux
->maybe
.setFirst();
648 if (aux
->numMaybe
>1) // If it has more than one maybe it is ambiguous
653 data
= (infoDict
*) aux
->maybe
.getIndex();
// NOTE(review): this local infoDict* tmp has the same name as the
// hash_node_t* tmp declared above and hides it in this scope.
// NOTE(review): what hash_insert does when the key already exists is not
// visible here; if it rejects duplicates this copy would leak -- confirm.
654 infoDict
* tmp
= new infoDict
;
655 strcpy(tmp
->txt
,data
->txt
);
656 tmp
->num
= data
->num
;
657 hash_insert(ambp
,tmp
->txt
,(uintptr_t) tmp
);
660 ret
=aux
->maybe
.next();
662 aux
->maybe
.setFirst();
669 /**************************************************/
// dictFindUnkP: allocates a new hash table (unkp) and walks every bucket of
// the hash table tptr. For each entry whose word count numWrd is exactly 1,
// every tag in its "maybe" list is copied into a new infoDict and inserted
// into unkp keyed by the tag text. The list cursor is reset with setFirst()
// before and after each entry.
// NOTE(review): initialisation of unkp, the use of numPOS and the return are
// not visible in this excerpt; presumably unkp is returned and numPOS receives
// the number of distinct tags -- confirm.
671 hash_t
*dictionary::dictFindUnkP(int *numPOS
)
677 hash_t
*unkp
= new hash_t
;
681 hash_node_t
**old_bucket
, *old_hash
, *tmp
;
684 old_bucket
=tptr
->bucket
;
// One iteration per hash bucket.
689 for (i
=0; i
<old_size
; i
++)
691 old_hash
=old_bucket
[i
];
// Advance along the bucket's chain of nodes.
695 old_hash
=old_hash
->next
;
697 aux
= (dataDict
*) tmp
->data
;
698 aux
->maybe
.setFirst();
699 if (aux
->numWrd
==1) // If it appears only once, it is treated as unknown
704 data
= (infoDict
*) aux
->maybe
.getIndex();
// NOTE(review): this local infoDict* tmp has the same name as the
// hash_node_t* tmp declared above and hides it in this scope.
// NOTE(review): what hash_insert does when the key already exists is not
// visible here; if it rejects duplicates this copy would leak -- confirm.
705 infoDict
* tmp
= new infoDict
;
706 strcpy(tmp
->txt
,data
->txt
);
707 tmp
->num
= data
->num
;
708 hash_insert(unkp
,tmp
->txt
,(uintptr_t) tmp
);
710 ret
=aux
->maybe
.next();
712 aux
->maybe
.setFirst();