2 * Copyright (C) 2004 Jesus Gimenez, Lluis Marquez and Senen Moya
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 #include <sys/types.h>
39 #define MAX_SENTENCE_LENGTH 1000
40 #define MAX_LINE_LENGTH 4096
42 /*****************************************************************
44 *****************************************************************/
// Pushes the "starts with a lowercase letter" feature for <wrd> onto <pila>.
// The pushed text is START_LOWER_MARK, then ':' and the value of `startlower`.
// The push follows the erLookRegExp2(&erStartLower, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `startlower` are
// not visible in this excerpt -- confirm against the complete file.
46 void swindow::winPushStartWithLowerFeature(const std::string
& wrd
, std::stack
<std::string
>&pila
)
50 //Does the word start with a lowercase letter?
51 if (erLookRegExp2(&erStartLower
,wrd
))
55 std::ostringstream feat
;
56 feat
<< START_LOWER_MARK
<<":"<<startlower
;
58 pila
.push(feat
.str());
// Pushes the "starts with a digit" feature for <wrd> onto <pila>.
// The pushed text is START_NUMBER_MARK, then ':' and the value of `startnumber`.
// The push follows the erLookRegExp2(&erStartNumber, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `startnumber` are
// not visible in this excerpt -- confirm against the complete file.
62 void swindow::winPushStartWithNumberFeature(const std::string
& wrd
, std::stack
<std::string
>&pila
)
66 //Does the word start with a number?
67 if (erLookRegExp2(&erStartNumber
,wrd
))
71 //mod Correcting dynamic memory errors
72 std::ostringstream feat
;
73 feat
<< START_NUMBER_MARK
<<":" << startnumber
;
75 pila
.push(feat
.str());
79 void swindow::winPushSuffixFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
,int longitud
)
81 //Obtenemos la longitud de la palabra
83 std::ostringstream feat
;
85 std::ostringstream suf
;
88 for (int i
=len
-longitud
; i
<=len
-1; i
++)
90 if (i
>=0) suf
<<wrd
[i
];
95 feat
<<SUFFIX_MARK
<<longitud
<<":"<<suf
.str();
96 pila
.push(feat
.str());
100 * void winPushPrefixFeature(const std::string& wrd, std::stack<std::string>& pila, int longitud)
101 * Builds the prefix "feature" of length <longitud> for the unknown word
102 * <wrd> and pushes it onto the stack passed in the <pila> parameter.
104 void swindow::winPushPrefixFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
, int longitud
)
106 //Obtenemos la longitud de la palabra
107 std::ostringstream feat
;
108 int len
= wrd
.size();
109 std::ostringstream pref
;
111 for (int i
=0; i
<longitud
; i
++)
113 if (len
> i
) pref
<<wrd
[i
];
114 else /*if (i > len-1 )*/ pref
<<"~";
117 feat
<<PREFIX_MARK
<<longitud
<<":"<<pref
.str();
118 pila
.push(feat
.str());
// Pushes the "starts with a capital letter" feature for <wrd> onto <pila>.
// The pushed text is START_CAPITAL_MARK, then ':' and the value of `startcap`.
// The push follows the erLookRegExp2(&erStartCap, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `startcap` are
// not visible in this excerpt -- confirm against the complete file.
122 void swindow::winPushStartWithCapFeature(const std::string
& wrd
, std::stack
<std::string
>&pila
)
126 //Does the word start with an uppercase letter?
127 if (erLookRegExp2(&erStartCap
,wrd
))
130 //mod Correcting dynamic memory errors
131 //std::string feat = new char[strlen(START_CAPITAL_MARK)+4];
132 std::ostringstream feat
;
134 feat
<< START_CAPITAL_MARK
<<":"<<startcap
;
135 pila
.push(feat
.str());
// Pushes the "whole word is uppercase" feature for <wrd> onto <pila>.
// The pushed text is ALL_UPPER_MARK, then ':' and the value of `allup`.
// The push follows the erLookRegExp2(&erAllUp, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `allup` are
// not visible in this excerpt -- confirm against the complete file.
139 void swindow::winPushAllUpFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
143 //Is the whole word in uppercase?
144 if (erLookRegExp2(&erAllUp
,wrd
))
148 //mod Correcting dynamic memory errors
149 //std::string feat = new char[4];
150 std::ostringstream feat
;
152 feat
<< ALL_UPPER_MARK
<<":"<<allup
;
153 pila
.push(feat
.str());
// Pushes the "whole word is lowercase" feature for <wrd> onto <pila>.
// The pushed text is ALL_LOWER_MARK, then ':' and the value of `alllow`.
// The push follows the erLookRegExp2(&erAllLow, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `alllow` are
// not visible in this excerpt -- confirm against the complete file.
157 void swindow::winPushAllLowFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
160 //Is the whole word in lowercase?
161 if (erLookRegExp2(&erAllLow
,wrd
))
165 //mod Correcting dynamic memory errors
166 //std::string feat = new char[4];
167 std::ostringstream feat
;
169 feat
<< ALL_LOWER_MARK
<<":"<<alllow
;
170 pila
.push(feat
.str());
// Pushes the CONTAIN_CAP_MARK feature for <wrd> onto <pila>.
// The pushed text is CONTAIN_CAP_MARK, then ':' and the value of `containcap`.
// The push follows the erLookRegExp2(&erContainCap, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `containcap` are
// not visible in this excerpt -- confirm against the complete file.
174 void swindow::winPushContainCapFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
177 if (erLookRegExp2(&erContainCap
,wrd
))
181 //mod Correcting dynamic memory errors
182 //std::string feat = new char[4];
183 std::ostringstream feat
;
185 feat
<< CONTAIN_CAP_MARK
<<":"<<containcap
;
186 pila
.push(feat
.str());
// Pushes the CONTAIN_CAPS_MARK feature for <wrd> onto <pila>.
// The pushed text is CONTAIN_CAPS_MARK, then ':' and the value of `containcaps`.
// The push follows the erLookRegExp2(&erContainCaps, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `containcaps` are
// not visible in this excerpt -- confirm against the complete file.
190 void swindow::winPushContainCapsFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
193 if (erLookRegExp2(&erContainCaps
,wrd
))
197 //mod Correcting dynamic memory errors
199 std::ostringstream feat
;
201 feat
<< CONTAIN_CAPS_MARK
<<":"<<containcaps
;
202 pila
.push(feat
.str());
// Pushes the CONTAIN_PERIOD_MARK feature for <wrd> onto <pila>.
// The pushed text is CONTAIN_PERIOD_MARK, then ':' and the value of the
// local `containperiod`. The push follows the
// erLookRegExp2(&erContainPeriod, wrd) test.
// NOTE(review): `containperiod` is initialised to 0 and no later assignment
// is visible in this excerpt, so as shown the value emitted would be 0 --
// confirm against the complete file whether it is set inside the branch.
206 void swindow::winPushContainPeriodFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
208 int containperiod
= 0;
210 if (erLookRegExp2(&erContainPeriod
,wrd
))
213 std::stringstream feat
;
214 feat
<< CONTAIN_PERIOD_MARK
<<":"<<containperiod
;
215 pila
.push(feat
.str());
// Pushes the CONTAIN_COMMA_MARK feature for <wrd> onto <pila>.
// The pushed text is CONTAIN_COMMA_MARK, then ':' and the value of the
// local `containcomma`. The push follows the
// erLookRegExp2(&erContainComma, wrd) test.
// NOTE(review): `containcomma` is initialised to 0 and no later assignment
// is visible in this excerpt, so as shown the value emitted would be 0 --
// confirm against the complete file whether it is set inside the branch.
219 void swindow::winPushContainCommaFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
221 int containcomma
= 0;
223 if (erLookRegExp2(&erContainComma
,wrd
))
226 std::ostringstream feat
;
227 feat
<< CONTAIN_COMMA_MARK
<<":"<<containcomma
;
228 pila
.push(feat
.str());
// Pushes the "contains a number" feature for <wrd> onto <pila>.
// The pushed text is CONTAIN_NUMBER_MARK, then ':' and the value of `containnum`.
// The push follows the erLookRegExp2(&erContainNum, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `containnum` are
// not visible in this excerpt -- confirm against the complete file.
232 void swindow::winPushContainNumFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
235 //Does the word contain a number?
236 if (erLookRegExp2(&erContainNum
,wrd
))
240 std::ostringstream feat
;
242 //sprintf(feat,"CN:%d",containnum);
243 feat
<< CONTAIN_NUMBER_MARK
<<":"<<containnum
;
244 pila
.push(feat
.str());
// Pushes the "multiword" feature for <wrd> onto <pila>.
// The pushed text is MULTIWORD_MARK, then ':' and the value of `multiword`.
// The push follows the erLookRegExp2(&erMultiWord, wrd) test.
// NOTE(review): the braces and the declaration/assignment of `multiword` are
// not visible in this excerpt -- confirm against the complete file.
248 void swindow::winPushMultiwordFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
251 //Is it a multiword expression?
252 if (erLookRegExp2(&erMultiWord
,wrd
))
256 //mod Correcting dynamic memory errors
257 //std::string feat = new char[6];
258 //sprintf(feat,"MW:%d",multiword);
259 std::ostringstream feat
;
260 feat
<< MULTIWORD_MARK
<<":"<<multiword
;
262 pila
.push(feat
.str());
266 void swindow::winPushLetterFeature(const std::string
& wrd
, std::stack
<std::string
>&pila
, int where
, int position
)
268 std::ostringstream feature
;
270 if (COUNTING_FROM_END
==where
)
272 feature
<<CHAR_Z_MARK
<< position
<<":"<<wrd
[wrd
.size()-position
];
276 feature
<<CHAR_A_MARK
<<position
<<":"<<wrd
[position
-1];
279 pila
.push(feature
.str());
282 void swindow::winPushLenghtFeature(const std::string
& wrd
, std::stack
<std::string
>& pila
)
284 //Obtenemos la longitud de la palabra
285 int len
= wrd
.size();
287 //Longitud de la palabra
288 //mod Correcting dynamic memory errors
289 //std::string feat = new char[4];
290 std::ostringstream feat
;
292 feat
<<LENGTH_MARK
<< ":"<<len
;
293 pila
.push(feat
.str());
297 * void winPushSwnFeature (struct std::stack *pila)
298 * Recibe como parametro <pila>, donde se apilara la "feature"
299 * Swn.Swn es el elemento final de frase que puede ser
302 void swindow::winPushSwnFeature(std::stack
<std::string
>& pila
)
304 std::string feature
= "Swn:";
311 * void winPushAmbiguityFeature(void *ptr, dictionary *d, std::stack *pila, int direction)
312 * Genera el atributo que representa la ambiguedad de una palabra.
313 * Recibe como parametros:
314 * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list)
315 * aunque se recibe como un void*.
316 * d, es el diccionario con el que estamos trabajarando
317 * pila,es la pila donde apilaremos el atributo generado
318 * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT
// Builds the ambiguity feature and pushes it onto <pila>.
// <ptr> is cast to nodo_feature_list*; the feature name is p->mark followed by
// the integer read from p->l.getIndex() and ':'. The word at that offset is
// fetched with get(*num, direction) and looked up in dictionary <d>.
// When the lookup does not return HASH_FAIL, the `pos` of the entries obtained
// from d->getElementMaybe(w) is appended, followed by "~" while numMaybe > 0;
// otherwise "UNKNOWN" is appended. EMPTY_POS is appended in the outer else branch.
// NOTE(review): the declarations of num/pn/pInfoDict, the loop over `list`,
// the braces and the condition matching the final `else` are not visible in
// this excerpt -- confirm against the complete file.
321 void swindow::winPushAmbiguityFeature(void* ptr
, dictionary
* d
, std::stack
<std::string
>& pila
, int direction
)
323 std::ostringstream value
;
324 nodo_feature_list
*p
= (nodo_feature_list
*)ptr
;
// Feature name: mark + window offset + ':'
330 num
= *p
->l
.getIndex();
331 value
<< p
->mark
<< *num
<< ":";
332 pn
= get(*num
, direction
);
// Dictionary lookup of the word found at that offset
335 dataDict
* w
= d
->getElement(pn
->wrd
);
336 if ((long)w
!=HASH_FAIL
)
338 simpleList
<infoDict
*>& list
= d
->getElementMaybe(w
);
339 int numMaybe
= d
->getElementNumMaybe(w
);
343 pInfoDict
= *list
.getIndex();
345 if (numMaybe
>0) value
<< pInfoDict
->pos
<< "~";
346 else value
<< pInfoDict
->pos
;
351 else value
<< "UNKNOWN"; //is unknown word
353 else value
<< EMPTY_POS
;
355 pila
.push(value
.str());
360 * void winPushMFTFeature(void *ptr, dictionary *d, std::stack *pila, int direction)
361 * Genera el atributo con la "Most Frequent Tag", la etiqueta mas frecuente.
362 * Recibe como parametros:
363 * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list)
364 * aunque se recibe como un void*.
365 * d, es el diccionario con el que estamos trabajarando
366 * pila,es la pila donde apilaremos el atributo generado
367 * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT
370 void swindow::winPushMFTFeature(void* ptr
, dictionary
* d
, std::stack
<std::string
>& pila
, int direction
)
374 nodo_feature_list
*p
= (nodo_feature_list
*)ptr
;
379 num
= *p
->l
.getIndex();
382 pn
= get(*num
, direction
);
385 dataDict
* w
= d
->getElement(pn
->wrd
);
386 if ((long)w
!=HASH_FAIL
)
388 simpleList
<infoDict
*>& list
= d
->getElementMaybe(w
);
389 int numMaybe
= d
->getElementNumMaybe(w
);
393 pInfoDict
= *list
.getIndex();
395 if (pInfoDict
->num
>max
) mft
= pInfoDict
->pos
;
401 else value
+= "UNKNOWN"; //is unknown word
403 else value
+= EMPTY_POS
;
410 * void winPushMaybeFeature(void *ptr, dictionary *d, std::stack *pila, int direction)
411 * Genera tantos atributos "maybe" como posibles POS pueda tener la palabra, y los
413 * Recibe como parametros:
414 * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list)
415 * aunque se recibe como un void*.
416 * d, es el diccionario con el que estamos trabajarando
417 * pila,es la pila donde apilaremos el atributo generado
418 * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT
421 void swindow::winPushMaybeFeature(void* ptr
, dictionary
* d
, std::stack
<std::string
>& pila
, int direction
)
424 std::ostringstream txt
;
425 nodo_feature_list
*p
= (nodo_feature_list
*)ptr
;
430 num
= *p
->l
.getIndex();
431 txt
<< p
->mark
<< *num
<< "~";
432 pn
= get(*num
, direction
);
435 dataDict
* w
= d
->getElement(pn
->wrd
);
437 if ((long)w
!=HASH_FAIL
)
439 simpleList
<infoDict
*>& list
= d
->getElementMaybe(w
);
444 pInfoDict
= *list
.getIndex();
445 feature
+= txt
.str() + pInfoDict
->pos
+ ":1";
454 feature
+= txt
.str() + "UNKNOWN:1";
461 feature
+= txt
.str() + EMPTY_POS
+ ":1";
468 * void winPushPosFeature(void *ptr, dictionary *d, std::stack *pila, int direction)
469 * Genera un atributo con la POS de algunos elementos de la ventana.
470 * Recibe como parametros:
471 * ptr, que es un puntero a un nodo de la lista de atributos (nodo_feature_list)
472 * aunque se recibe como un void*.
473 * d, es el diccionario con el que estamos trabajarando
474 * pila,es la pila donde apilaremos el atributo generado
475 * direction, es la direccion en que estamos recorriendo el corpus (LEFT_TO_RIGHT
478 void swindow::winPushPosFeature(void* ptr
, dictionary
* d
, std::stack
<std::string
>& pila
, int direction
)
482 nodo_feature_list
*p
= (nodo_feature_list
*)ptr
;
492 num
= *p
->l
.getIndex();
494 pn
= get(*num
, direction
);
496 if (pn
==NULL
) txt
= EMPTY_POS
;
497 else if ( (pn
->pos
== EMPTY
) || (*num
==0) ) //AKI3
500 dataDict
* w
= d
->getElement(pn
->wrd
);
502 if ((long)w
!=HASH_FAIL
)
504 simpleList
<infoDict
*>& list
= d
->getElementMaybe(w
);
505 int numMaybe
= d
->getElementNumMaybe(w
);
511 pInfoDict
= *list
.getIndex();
513 if (numMaybe
>0) txt
+= pInfoDict
->pos
+ "_";
514 else txt
+= pInfoDict
->pos
;
519 else txt
= "UNKNOWN"; //is unknown word
521 else txt
= pn
->pos
; //AKI3
523 if (value
.empty()) value
= txt
; //AKI3
524 else value
+= "~" + txt
;
530 feature
= ":" + value
;
531 //std::cerr << feature << std::endl;
536 * void winPushWordFeature(void *ptr, dictionary *d, std::stack<std::string>& pila, int direction)
537 * Generates a feature with the word of some elements of the window.
538 * Parameters:
539 * ptr, a pointer to a node of the feature list (nodo_feature_list),
540 * although it is received as a void*.
541 * d, the dictionary we are working with
542 * pila, the stack onto which the generated feature is pushed
543 * direction, the direction in which the corpus is being traversed (LEFT_TO_RIGHT or RIGHT_TO_LEFT)
546 void swindow::winPushWordFeature(void* ptr
, dictionary
* /*d*/, std::stack
<std::string
>& pila
, int direction
)
549 std::ostringstream name
;
551 nodo_feature_list
*p
= (nodo_feature_list
*)ptr
;
554 int *num
= *p
->l
.getIndex();
555 pn
= get(*num
, direction
);
557 if (pn
==NULL
) value
= EMPTY_WORD
;
558 else value
= pn
->wrd
;
559 name
<< std::string(p
->mark
) << *num
;
563 num
= *p
->l
.getIndex();
566 pn
= get(*num
, direction
);
568 if (pn
==NULL
) txt
= EMPTY_WORD
;
573 name
<< ":" << value
;
575 pila
.push(name
.str());
579 /****************************************************************************/
582 int swindow::sentenceLength()
584 //Retorna el número de palabras que tiene la frase cargada en este objeto
590 * Elimina todas las palabras existentes en la ventana
591 * Retorna el número de elementos que poseia la ventana
593 void swindow::deleteList()
595 // std::cerr << "swindow::deleteList " << numObj << " elements" << std::endl;
596 if (first
==0) return;
599 while (first
->next
!=0)
602 // std::cerr << "swindow::deleteList delete " << i << "th element: " << &(first->strScores) << " " << first->strScores << std::endl;
603 assert(first
->next
->previous
== first
);
605 delete first
->previous
;
610 // std::cerr << "swindow::deleteList delete " << i << "th element" << std::endl;
621 void swindow::init(dictionary
* dic
)
// Initial fill of the window from the input, using dictionary <dic>.
// Calls iniList(dic); when that result is >= 0 it goes on to readSentence(dic).
// Returns -1 when iniList returned -1. Otherwise posEnd is set to
// posIndex + last->ord (result 0) or posIndex + ret (any other result).
// NOTE(review): the statements before iniList, after the posEnd update and
// the final return are not visible in this excerpt.
627 int swindow::iniGeneric(dictionary
* dic
)
632 int ret
= iniList(dic
);
634 if (ret
>=0) readSentence(dic
);
636 if (ret
==-1) return -1;
637 else if (ret
==0) posEnd
= posIndex
+last
->ord
;
638 else posEnd
=posIndex
+ret
;
// Reads input words with readInput(dic), starting the counter j at posIndex
// and stopping when j reaches lengthWin or readInput returns a negative value.
// When the last result is >= 0 it is replaced by j - posIndex - 1.
// NOTE(review): the declarations of j/ret and the return statement are not
// visible in this excerpt.
645 int swindow::iniList(dictionary
* dic
)
649 for(j
=posIndex
; ((j
<lengthWin
) && (ret
>=0)); j
++) ret
= readInput(dic
);
652 // 0 if end of sentence
653 // -1 if there aren't words
655 if (ret
>=0) ret
=j
-posIndex
-1;
660 void swindow::setWindow(const std::vector
<nodo
*>& user_window
) {
661 this->user_window
= user_window
;
665 /****************************************************************************/
// Keeps calling readInput(dic) for as long as its result is >= 0.
// NOTE(review): the declaration/initial value of `ret` and the return
// statement are not visible in this excerpt.
667 int swindow::readSentence(dictionary
* dic
)
670 while (ret
>=0) ret
= readInput(dic
);
// Reads one word from m_reader and builds a node for it.
// parseWord() is called repeatedly while it returns 1 (the loop body is
// empty); buildNode(word, comment) then creates the node. When the parsed
// tag set is not empty, the word and its tags are passed to
// dic->addBackupEntry().
// NOTE(review): the declaration of `ret`, what happens to `node` afterwards
// and the return statement are not visible in this excerpt.
675 int swindow::readInput(dictionary
* dic
) {
676 std::string word
, comment
;
677 std::set
<std::string
> tagset
;
// Repeat parseWord until it returns something other than 1
680 while((ret
= m_reader
.parseWord(word
, tagset
, comment
)) == 1);
681 nodo
* node
= m_reader
.buildNode(word
, comment
);
684 if(!tagset
.empty()) dic
->addBackupEntry(word
, tagset
);
690 /****************************************************************************/
692 void swindow::winAdd(nodo
*aux
)
694 // add the node in the software window
703 aux
->previous
= last
;
712 /****************************************************************************/
719 swindow::swindow(istream
& in
, std::ostream
* output
, dictionary
* dic
) : m_output(output
), m_reader(in
),
720 first(0), last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0)
728 swindow::swindow(int lengthWin
, dictionary
*dic
): m_output(NULL
), m_reader(),
729 first(0), last(0), numObj(0), beginWin(0), endWin(0), posBegin(0), posEnd(lengthWin
)
731 this->lengthWin
= lengthWin
;
736 swindow::swindow(istream
& in
, int number
, int position
, std::ostream
* output
, dictionary
* dic
) : m_output(output
), m_reader(in
),
737 first(0), last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0)
740 if ((number
<3) || (number
<=position
))
741 { fprintf(stderr
,"\nWindow Length can not be first or last element.\nLength should be greater than \"Interest Point Position\" or 3.\n");
746 posIndex
= position
-1;
751 swindow::swindow(istream
& in
, int number
, std::ostream
* output
, dictionary
* dic
) : m_output(output
), m_reader(in
),
752 first(0), last(0), numObj(0),index(0),beginWin(0),endWin(0),posBegin(0),posEnd(0)
761 /****************************************************************************/
763 /* Move Interest Point to next element */
767 if (endWin
->next
!=0) ret
=true;
769 if ((index
==0) || (index
->next
==0)) return false;
770 if ((posIndex
>=posEnd
) && (!ret
)) return false;
772 if ((posIndex
<posEnd
) && (!ret
)) posEnd
--;
773 if ((posEnd
==lengthWin
-1) && (ret
)) endWin
= endWin
->next
;
775 if (posBegin
==0) beginWin
= beginWin
->next
;
776 else if ((posIndex
>=posBegin
) && (posBegin
>0)) posBegin
--;
782 /****************************************************************************/
784 /* Move Interest Point to previous element */
785 bool swindow::previous()
787 if ((index
==NULL
) || (index
->previous
==NULL
)) return false;
789 if ((posBegin
==0) && (beginWin
->previous
!=NULL
)) beginWin
= beginWin
->previous
;
790 else if (posIndex
>posBegin
) posBegin
++;
792 if (posEnd
<lengthWin
-1) posEnd
++;
793 else endWin
= endWin
->previous
;
795 index
= index
->previous
;
799 /****************************************************************************/
801 /* Get Interest Point */
802 nodo
*swindow::getIndex()
807 /****************************************************************************/
// Returns the node at offset <position> relative to the interest point.
// A <direction> equal to 2 negates the offset first. The offset is then
// tested against the window limits posBegin / posEnd.
// NOTE(review): the statement executed when that range test succeeds is not
// visible in this excerpt (presumably an early return for "outside the
// window") -- confirm against the complete file.
// The lookup goes to get_user() when user_window is not empty and to
// get_intern() otherwise.
809 nodo
*swindow::get(int position
, int direction
)
811 if (direction
==2) position
= -position
;
// Range test: offset left of posBegin or right of posEnd
812 if ( ((position
<0) && (posIndex
+position
+1<posBegin
))
813 || ((position
>0) && (posIndex
+position
>posEnd
)) )
816 if(!user_window
.empty())
817 return get_user(position
);
819 return get_intern(position
);
822 // position must be valid, ie. "in the window"
823 nodo
*swindow::get_user(int position
)
825 return user_window
[position
+2];
828 nodo
*swindow::get_intern(int position
)
833 if (position
== 0) return index
;
839 if (aux
->next
!= NULL
) aux
= aux
->next
;
844 if (aux
->previous
!= NULL
) aux
= aux
->previous
;
853 /****************************************************************************/
// Writes the nodes of the list to *m_output, walking from `first` through the
// `next` links, one node per line.
// Each line holds realWrd, a space and pos. When showScoresFlag == TRUE and
// strScores is not empty, a space and strScores are appended; when
// showComments == TRUE and comment is not empty, a space and comment are
// appended. Each line is ended with std::endl.
// Returns 0 immediately when the list is empty (first == NULL).
// NOTE(review): m_output is dereferenced without a null check, and the tail of
// the function (closing braces, final return) is not visible in this excerpt.
855 int swindow::show(int showScoresFlag
, int showComments
)
859 if (first
==NULL
) return 0;
861 nodo
*actual
= first
;
863 while(actual
!= NULL
) {
864 *m_output
<< actual
->realWrd
<< " " << actual
->pos
;
// Optional score column
866 if ( showScoresFlag
== TRUE
&& !actual
->strScores
.empty() ) {
867 *m_output
<< " " << actual
->strScores
;
// Optional comment column
870 if ( showComments
== TRUE
&& !actual
->comment
.empty() ) {
871 *m_output
<< " " << actual
->comment
;
874 *m_output
<< std::endl
;
876 actual
= actual
->next
;
882 /****************************************************************************/
884 void swindow::putLengthWin(int l
)
889 /****************************************************************************/
891 void swindow::putIndex(int i
)
896 /****************************************************************************/
899 * Modifica el valor de los pesos para una palabra
901 * action = 0 --> Pone el peso máximo (put max score)
902 * action = 1 --> Inicializa los pesos (reset values)
903 * action = 2 --> Restaura el valor de la vuelta anterior(last lap value)
905 int swindow::winMaterializePOSValues(int action
)
907 if (first
==NULL
) return 0;
910 weight_node_t
*w
,max
;
920 while(!actual
->stackScores
.empty())
922 w
= actual
->stackScores
.top();
923 actual
->stackScores
.pop();
925 if (inicio
|| w
->data
>max
.data
)
933 actual
->weight
=max
.data
;
934 actual
->pos
= max
.pos
;
935 //Added for 2 laps tagging
936 actual
->weightOld
=max
.data
;
937 actual
->posOld
= max
.pos
;
939 case 1: //RESET VALUES
944 actual
->pos
= actual
->posOld
;
945 actual
->weight
=actual
->weightOld
;
953 /****************************************************************************/
956 * int winExistUnkWord(int direction, dictionary *d)
957 * This function checks whether there are unknown words.
958 * Depending on the value of the direction parameter:
959 * LEFT_TO_RIGHT - looks for unknown words to the
960 * right of the window's interest point.
961 * RIGHT_TO_LEFT - looks for unknown words to the left
962 * of the window's interest point.
963 * This function returns:
964 * an integer >=0, if there are no unknown words
965 * -1, if there are unknown words
967 int swindow::winExistUnkWord(int direction
, dictionary
*d
)
970 int ret
=0,i
=posIndex
;
972 if (index
==NULL
) return 1;
980 if (aux
->next
==NULL
|| aux
==endWin
) ret
=-1;
981 else aux
= aux
->next
;
982 if ((long)d
->getElement(aux
->wrd
)==HASH_FAIL
) return -1;
986 if (aux
->previous
==NULL
|| aux
==beginWin
) ret
=-1;
987 else aux
= aux
->previous
;
988 if ((long)d
->getElement(aux
->wrd
)==HASH_FAIL
) return -1;