1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: mythes.cxx,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
32 // MARKER(update_precomp.py): autogen include statement, do not remove
33 #include "precompiled_lingucomponent.hxx"
34 #include "license.readme"
// Constructs the thesaurus object from an index file (idxpath) and a data
// file (datpath); the loading itself is delegated to thInitialize().
44 MyThes::MyThes(const char* idxpath
, const char * datpath
)
// any result other than 1 is treated as a failed load and reported on stderr
51 if (thInitialize(idxpath
, datpath
) != 1) {
52 fprintf(stderr
,"Error - can't open %s or %s\n",idxpath
, datpath
);
55 // did not initialize properly - throw exception?
// Loads the index file at idxpath into the member tables and then opens the
// data file at datpath, keeping the handle in pdfile.
//   - the first index line is duplicated into `encoding`
//   - the second index line is converted with atoi() and sizes `list`/`offst`
//   - every further line is split at its first '|': the text before it is
//     copied into list[nw], the number after it is stored in offst[nw]
// NOTE(review): the return statements are not visible in this listing; the
// constructor treats a result of 1 as success.
66 int MyThes::thInitialize(const char* idxpath
, const char* datpath
)
69 // open the index file
70 FILE * pifile
= fopen(idxpath
,"r");
75 // parse the encoding and the index size
// wrd is a zero-filled scratch buffer of MAX_WD_LEN bytes that receives each
// index line in turn
77 wrd
= (char *)calloc(1, MAX_WD_LEN
);
79 fprintf(stderr
,"Error - bad memory allocation\n");
// NOTE(review): len is not inspected between this read and the mystrdup() on
// the next source line, so a failed read still duplicates whatever wrd holds
84 int len
= readLine(pifile
,wrd
,MAX_WD_LEN
);
85 encoding
= mystrdup(wrd
);
86 len
= readLine(pifile
,wrd
,MAX_WD_LEN
);
87 int idxsz
= atoi(wrd
);
90 // now allocate list, offst for the given size
// calloc zero-fills both tables, so unused slots stay NULL / 0
91 list
= (char**) calloc(idxsz
,sizeof(char*));
92 offst
= (unsigned int*) calloc(idxsz
,sizeof(unsigned int));
94 if ( (!(list
)) || (!(offst
)) ) {
95 fprintf(stderr
,"Error - bad memory allocation\n");
101 // now parse the remaining lines of the index
102 len
= readLine(pifile
,wrd
,MAX_WD_LEN
);
// np is the position of the first '|' in the current line
105 int np
= mystr_indexOfChar(wrd
,'|');
// np+1 zero-filled bytes: room for the np copied characters plus a NUL
109 list
[nw
] = (char *)calloc(1,(np
+1));
111 fprintf(stderr
,"Error - bad memory allocation\n");
116 memcpy((list
[nw
]),wrd
,np
);
// the number after the '|' is later used by Lookup() as the fseek() position
// of this entry inside the data file
117 offst
[nw
] = atoi(wrd
+np
+1);
121 len
= readLine(pifile
,wrd
,MAX_WD_LEN
);
127 /* next open the data file */
128 pdfile
= fopen(datpath
,"r");
// Releases what the object holds: the data file, the nw strings kept in the
// list table, and the encoding and offst buffers.
137 void MyThes::thCleanup()
139 /* first close the data file */
147 /* now free up all the allocated strings on the list */
148 for (int i
=0; i
< nw
; i
++)
// both buffers are only freed when they were actually allocated
158 if (encoding
) free((void*)encoding
);
159 if (offst
) free((void*)offst
);
169 // look up the text pText (len bytes) in the index and build, in *pme, a list
170 // of meaning entries, with each entry having a synonym count and a pointer to an
171 // array of char * (i.e. the synonyms)
173 // note: the calling routine should call CleanUpAfterLookup with the original
174 // meaning pointer and count to properly deallocate memory
// Returns 0 when the data file is not open or when the word is not in the index.
176 int MyThes::Lookup(const char * pText
, int len
, mentry
** pme
)
181 // handle the case of missing file or file related errors
182 if (! pdfile
) return 0;
186 /* copy search word and make sure null terminated */
// calloc zero-fills len+1 bytes, so the len copied bytes are always followed
// by a NUL
// NOTE(review): the calloc result is handed to memcpy on the very next source
// line without a NULL check
187 char * wrd
= (char *) calloc(1,(len
+1));
188 memcpy(wrd
,pText
,len
);
190 /* find it in the list */
// binsearch() is skipped for an empty index (nw == 0); -1 means "not found"
191 int idx
= nw
> 0 ? binsearch(wrd
,list
,nw
) : -1;
193 if (idx
< 0) return 0;
195 // now seek to the offset
// offst[idx] is the position of this word's record inside the data file
196 offset
= (long) offst
[idx
];
197 int rc
= fseek(pdfile
,offset
,SEEK_SET
);
202 // grab the count of the number of meanings
203 // and allocate a list of meaning entries
// buf is the line buffer reused for every line read from the data file
205 buf
= (char *) malloc( MAX_LN_LEN
);
// the first line of the record carries the meaning count after its first '|';
// the value returned by readLine() is not used here
207 readLine(pdfile
, buf
, (MAX_LN_LEN
-1));
208 int np
= mystr_indexOfChar(buf
,'|');
// NOTE(review): nmeanings comes straight from atoi() and is used as the
// allocation count on the next source line without validation
213 int nmeanings
= atoi(buf
+np
+1);
214 *pme
= (mentry
*) malloc( nmeanings
* sizeof(mentry
) );
220 // now read in each meaning and parse it to get defn, count and synonym lists
// dfn is the scratch buffer in which the definition string is assembled
222 char dfn
[MAX_WD_LEN
];
// one line of the data file per meaning
224 for (int j
= 0; j
< nmeanings
; j
++) {
225 readLine(pdfile
, buf
, (MAX_LN_LEN
-1));
231 // store away the part of speech for later use
234 np
= mystr_indexOfChar(p
,'|');
243 // count the number of fields in the remaining line
246 np
= mystr_indexOfChar(d
,'|');
250 np
= mystr_indexOfChar(d
,'|');
// one char* slot per counted field
253 pm
->psyns
= (char **) malloc(nf
*sizeof(char*));
255 // fill in the synonym list
257 for (int jj
= 0; jj
< nf
; jj
++)
259 np
= mystr_indexOfChar(d
,'|');
// each synonym is stored as its own malloc'ed copy (see mystrdup)
263 pm
->psyns
[jj
] = mystrdup(d
);
268 pm
->psyns
[jj
] = mystrdup(d
);
272 // add pos to first synonym to create the definition
274 int m
= strlen(pm
->psyns
[0]);
// the combined string is only built when it fits into dfn
275 if ((k
+m
) < (MAX_WD_LEN
- 1)) {
// m+1 also copies the terminating NUL of the first synonym
278 strncpy((dfn
+k
+1),(pm
->psyns
[0]),m
+1);
279 pm
->defn
= mystrdup(dfn
);
// presumably the fallback branch (its `else` is not visible in this listing):
// the first synonym alone becomes the definition
281 pm
->defn
= mystrdup(pm
->psyns
[0]);
// Frees the memory attached to the nmeanings entries handed out through *pme:
// for each entry every synonym string, then the psyns array, then the defn
// string. Does nothing when nmeanings is 0 or *pme is NULL.
294 void MyThes::CleanUpAfterLookup(mentry
** pme
, int nmeanings
)
297 if (nmeanings
== 0) return;
298 if ((*pme
) == NULL
) return;
302 for (int i
= 0; i
< nmeanings
; i
++) {
// count is the number of synonym strings stored in this entry
303 int count
= pm
->count
;
304 for (int j
= 0; j
< count
; j
++) {
305 if (pm
->psyns
[j
]) free(pm
->psyns
[j
]);
// the strings are released above, before the array that holds them
308 if (pm
->psyns
) free(pm
->psyns
);
310 if (pm
->defn
) free(pm
->defn
);
322 // read a line of text from a text file, stripping
323 // off the line terminator and replacing it with
324 // a null string terminator.
325 // returns: -1 on error or the number of characters
326 // in the returned string
328 // A maximum of nc characters will be returned
// (fgets itself stores at most nc-1 characters followed by a NUL)
330 int MyThes::readLine(FILE * pf
, char * buf
, int nc
)
// fgets yields NULL on end of file or on a read error
333 if (fgets(buf
,nc
,pf
)) {
342 // performs a binary search for sw among the null terminated character
// strings of _list, comparing with strcmp()
345 // returns: -1 on not found
346 // index of sw in _list[]
// NOTE(review): _list[lp] and _list[up] are read unconditionally below;
// Lookup() only calls this when nw > 0.
348 int MyThes::binsearch(char * sw
, char* _list
[], int nlst
)
350 int lp
, up
, mp
, j
, indx
;
// lp/up are presumably the first and last index of the range (their
// initialisation is not visible in this listing): a word sorting before the
// lower bound or after the upper bound is rejected immediately
354 if (strcmp(sw
,_list
[lp
]) < 0) return -1;
355 if (strcmp(sw
,_list
[up
]) > 0) return -1;
// mp is the midpoint of the current [lp, up] range
357 mp
= (int)((lp
+up
) >> 1);
// j is negative, zero or positive as sw sorts before, equal to or after _list[mp]
358 j
= strcmp(sw
,_list
[mp
]);
// the bounds have crossed: the word is not in the list
366 if (lp
> up
) return -1;
// Returns the encoding string that thInitialize() stored in `encoding`, when
// one is set. The pointer is the member itself, not a copy.
// NOTE(review): the value returned when encoding is unset is not visible here.
371 char * MyThes::get_th_encoding()
373 if (encoding
) return encoding
;
378 // string duplication routine
// The copy lives in malloc'ed storage; elsewhere in this file such copies are
// released with free().
379 char * MyThes::mystrdup(const char * p
)
// sl includes the terminating NUL
381 int sl
= strlen(p
) + 1;
382 char * d
= (char *)malloc(sl
);
390 // remove cross-platform text line end characters
// Works in place on s. k is presumably the length of s (its definition is not
// visible in this listing).
391 void MyThes::mychomp(char * s
)
// drop a trailing '\r' or '\n' ...
394 if ((k
> 0) && ((*(s
+k
-1)=='\r') || (*(s
+k
-1)=='\n'))) *(s
+k
-1) = '\0';
// ... and a '\r' directly in front of it, which covers "\r\n" line ends
395 if ((k
> 1) && (*(s
+k
-2) == '\r')) *(s
+k
-2) = '\0';
399 // return index of char in string
// The index is the zero-based offset of the first occurrence of c in d.
// NOTE(review): the value returned when c does not occur is not visible here.
400 int MyThes::mystr_indexOfChar(const char * d
, int c
)
// strchr returns a pointer to the first match, or NULL when there is none
402 char * p
= strchr((char *)d
,c
);
// pointer difference = offset of the match from the start of d
403 if (p
) return (int)(p
-d
);