update dev300-m57
[ooovba.git] / libtextcat / libtextcat-2.2.patch
blobc9ce4add875cdf5a9e30e48db93538210d958e1a
1 --- misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003
2 +++ misc/build/libtextcat-2.2/configure Mon Mar 31 11:29:14 2008
3 @@ -5391,7 +5391,8 @@
4 allow_undefined_flag=
5 no_undefined_flag=
6 need_lib_prefix=unknown
7 -need_version=unknown
8 +#need_version=unknown
9 +need_version=no
10 # when you set need_version to no, make sure it does not cause -set_version
11 # flags to be left without arguments
12 archive_cmds=
13 @@ -5785,7 +5786,7 @@
14 # cross-compilation, but unfortunately the echo tests do not
15 # yet detect zsh echo's removal of \ escapes. Also zsh mangles
16 # `"' quotes if we put them in here... so don't!
17 - archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)'
18 + archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$compiler_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)'
19 # We need to add '_' to the symbols in $export_symbols first
20 #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
21 hardcode_direct=yes
22 @@ -6280,7 +6281,7 @@
25 freebsd*)
26 - objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
27 + objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf`
28 version_type=freebsd-$objformat
29 case $version_type in
30 freebsd-elf*)
31 --- misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003
32 +++ misc/build/libtextcat-2.2/src/Makefile.in Mon Mar 31 11:29:14 2008
33 @@ -124,20 +124,20 @@
34 target_vendor = @target_vendor@
35 AUTOMAKE_OPTIONS = 1.4 foreign
37 -WARNS = -W -Wall -Wshadow -Wpointer-arith
38 -IFLAGS =
39 -FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
40 +#WARNS = -W -Wall -Wshadow -Wpointer-arith
41 +IFLAGS =
42 +#FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
43 VERBOSE = -DVERBOSE
44 AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
45 AM_LDFLAGS = -g
47 noinst_HEADERS = \
48 - common.h constants.h fingerprint.h textcat.h wg_mempool.h
49 + common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h
52 lib_LTLIBRARIES = libtextcat.la
53 libtextcat_la_SOURCES = \
54 - common.c fingerprint.c textcat.c wg_mempool.c
55 + common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
58 bin_PROGRAMS = createfp
59 @@ -156,7 +156,7 @@
60 libtextcat_la_LDFLAGS =
61 libtextcat_la_LIBADD =
62 am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
63 - wg_mempool.lo
64 + wg_mempool.lo utf8misc.lo
65 libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
66 bin_PROGRAMS = createfp$(EXEEXT)
67 noinst_PROGRAMS = testtextcat$(EXEEXT)
68 @@ -177,7 +177,8 @@
69 @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \
70 @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \
71 @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \
72 -@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo
73 +@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \
74 +@AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo
75 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
76 $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
77 LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
78 @@ -213,7 +214,7 @@
79 @rm -f stamp-h1
80 cd $(top_builddir) && $(SHELL) ./config.status src/config.h
82 -$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
83 +$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
84 cd $(top_srcdir) && $(AUTOHEADER)
85 touch $(srcdir)/config.h.in
87 @@ -247,8 +248,8 @@
88 echo "rm -f \"$${dir}/so_locations\""; \
89 rm -f "$${dir}/so_locations"; \
90 done
91 -libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
92 - $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
93 +libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
94 + $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
95 binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
96 install-binPROGRAMS: $(bin_PROGRAMS)
97 @$(NORMAL_INSTALL)
98 @@ -285,10 +286,10 @@
99 echo " rm -f $$p $$f"; \
100 rm -f $$p $$f ; \
101 done
102 -createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
103 +createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
104 @rm -f createfp$(EXEEXT)
105 $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS)
106 -testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
107 +testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
108 @rm -f testtextcat$(EXEEXT)
109 $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS)
111 @@ -304,6 +305,7 @@
112 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@
113 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@
114 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@
115 +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@
117 distclean-depend:
118 -rm -rf ./$(DEPDIR)
119 --- misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003
120 +++ misc/build/libtextcat-2.2/src/common.c Mon Mar 31 11:29:14 2008
121 @@ -3,23 +3,23 @@
123 * Copyright (c) 2003, WiseGuys Internet B.V.
124 * All rights reserved.
125 - *
127 * Redistribution and use in source and binary forms, with or without
128 * modification, are permitted provided that the following conditions
129 * are met:
130 - *
132 * - Redistributions of source code must retain the above copyright
133 * notice, this list of conditions and the following disclaimer.
134 - *
136 * - Redistributions in binary form must reproduce the above copyright
137 * notice, this list of conditions and the following disclaimer in the
138 * documentation and/or other materials provided with the
139 * distribution.
140 - *
142 * - Neither the name of the WiseGuys Internet B.V. nor the names of
143 * its contributors may be used to endorse or promote products derived
144 * from this software without specific prior written permission.
145 - *
147 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
148 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
149 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
150 @@ -114,11 +114,11 @@
151 wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
154 - return( result );
155 + return( result );
158 -extern void* wg_realloc( void *ptr, size_t size )
160 +extern void* wg_realloc( void *ptr, size_t size )
162 void *result;
164 if (!size) {
165 @@ -131,7 +131,7 @@
166 wgmem_error( "Error while reallocing %u bytes.\n", size );
169 - return( result );
170 + return( result );
173 extern void wg_free( void *mem )
174 @@ -148,12 +148,12 @@
175 if ( fgets(line, size, fp) == NULL ) {
176 return NULL;
180 /** kill term null **/
181 if ( (p = strpbrk( line, "\n\r" )) ) {
182 *p = '\0';
183 - }
187 return line;
190 @@ -164,39 +164,39 @@
192 * ARGUMENTS:
193 * - result:
194 - *
196 * After the split, this array contains pointers to the start of each
197 * detected segment. Must be preallocated and at least as large as
198 * maxsegments. The pointers point into the dest buffer.
199 - *
200 - * - dest:
201 - *
203 + * - dest:
205 * String into which result points as an index. Must be preallocated, and
206 * at least as big as src. You can use src as dest, but in that case src
207 * is overwritten!
208 - *
209 - * - src:
210 - *
212 + * - src:
214 * The string to split. Sequences of whitespace are treated as separators, unless
215 * escaped. There are two ways to escape: by using single quotes (anything
216 * between single quotes is treated as one segment), or by using a backslash
217 * to escape the next character. The backslash escape works inside quotation
218 * as well.
219 - *
221 * Example:
222 - *
224 * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
225 - *
227 * "It's"
228 * "very easy"
229 * "to use WiseGuys' wg_split()"
230 * "function"
231 - *
232 - * - maxsegments:
233 - *
235 + * - maxsegments:
237 * The maximum number of segments. If the splitter runs out of segments,
238 * the remainder of the string is stored in the last segment.
239 - *
241 * RETURN VALUE:
242 * The number of segments found.
244 @@ -218,12 +218,12 @@
245 switch (state) {
246 case 0:
247 /*** Skip spaces ***/
248 - while ( isspace((int) *p) ) {
249 + while ( isspace((unsigned char) *p) ) {
250 p++;
252 state = 1;
254 - case 1:
255 + case 1:
256 /*** Start segment ***/
257 result[cnt] = w;
258 cnt++;
259 @@ -232,12 +232,12 @@
260 case 2:
261 /*** Unquoted segment ***/
262 while (*p) {
263 - if ( isspace((int) *p) ) {
264 + if ( isspace((unsigned char) *p) ) {
265 *w++ = '\0';
266 p++;
267 state = 0;
268 break;
269 - }
271 else if ( *p == '\'' ) {
272 /*** Start quotation ***/
273 p++;
274 @@ -292,17 +292,17 @@
278 +#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
279 extern void wg_timerstart(wgtimer_t *t)
281 -#ifdef HAVE_GETTIMEOFDAY
282 gettimeofday( &(t->start), NULL );
283 -#endif
285 +#endif /* TL : no struct timeval under Win32 */
288 +#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
289 extern uint4 wg_timerstop(wgtimer_t *t)
291 -#ifdef HAVE_GETTIMEOFDAY
292 uint4 result;
293 gettimeofday( &(t->stop), NULL );
294 result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
295 @@ -312,25 +312,23 @@
296 t->start.tv_usec = t->stop.tv_usec;
298 return result;
299 -#else
300 - return 0;
301 -#endif
303 +#endif /* TL : no struct timeval under Win32 */
307 * wg_strgmov -- a guarded strcpy() variation
308 - *
310 * copies src to dest (including terminating zero), and returns
311 * pointer to position of terminating zero in dest. The function is
312 * guaranteed not to write past destlimit. If the copy couldn't be
313 - * finished, the function returns NULL after restoring the first
314 - * character in dest for your convenience (since this is usually a zero).
315 + * finished, the function returns NULL after restoring the first
316 + * character in dest for your convenience (since this is usually a zero).
318 char *wg_strgmov( char *dest, const char *src, const char *destlimit )
320 char tmp, *w;
323 if ( !dest || dest >= destlimit ) {
324 return NULL;
326 @@ -355,7 +353,7 @@
330 - * wg_trim() -- remove whitespace surrounding a string.
331 + * wg_trim() -- remove whitespace surrounding a string.
333 * Example: " bla bla bla " becomes "bla bla bla" after trimming.
335 @@ -373,12 +371,12 @@
336 char *lastnonspace = &dest[-1];
337 const char *p = src;
338 char *w = dest;
340 - while ( isspace((int)*p) ) {
342 + while ( isspace((unsigned char)*p) ) {
343 p++;
345 while (*p) {
346 - if ( !isspace((int)*p) ) {
347 + if ( !isspace((unsigned char)*p) ) {
348 lastnonspace = w;
350 *w++ = *p++;
351 --- misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003
352 +++ misc/build/libtextcat-2.2/src/common.h Mon Mar 31 11:29:14 2008
353 @@ -1,28 +1,28 @@
354 #ifndef _COMMON_H_
355 #define _COMMON_H_
357 - * common.h -- a mixed bag of helper functions
358 + * common.h -- a mixed bag of helper functions
360 * Copyright (C) 2003 WiseGuys Internet B.V.
362 * THE BSD LICENSE
363 - *
365 * Redistribution and use in source and binary forms, with or without
366 * modification, are permitted provided that the following conditions
367 * are met:
368 - *
370 * - Redistributions of source code must retain the above copyright
371 * notice, this list of conditions and the following disclaimer.
372 - *
374 * - Redistributions in binary form must reproduce the above copyright
375 * notice, this list of conditions and the following disclaimer in the
376 * documentation and/or other materials provided with the
377 * distribution.
378 - *
380 * - Neither the name of the WiseGuys Internet B.V. nor the names of
381 * its contributors may be used to endorse or promote products derived
382 * from this software without specific prior written permission.
383 - *
385 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
386 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
387 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
388 @@ -86,10 +86,12 @@
389 typedef char boole;
390 #endif
392 +#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
393 typedef struct wgtimer_s {
394 struct timeval start;
395 struct timeval stop;
396 } wgtimer_t;
397 +#endif /* TL : no struct timeval under Win32 */
400 extern void *wg_malloc( size_t size );
401 @@ -101,13 +103,15 @@
403 extern char *wg_getline( char *line, int size, FILE *fp );
405 +#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
406 extern void wg_timerstart(wgtimer_t *t);
407 extern uint4 wg_timerstop(wgtimer_t *t);
408 +#endif /* TL : no struct timeval under Win32 */
410 extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
411 extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
412 extern char *wg_trim( char *dest, const char *src );
416 #endif
418 --- misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003
419 +++ misc/build/libtextcat-2.2/src/constants.h Mon Mar 31 11:29:14 2008
420 @@ -39,6 +39,8 @@
422 #include <limits.h>
424 +#define _UTF8_
426 #define DESCRIPTION "out of place"
428 /* Reported matches are those fingerprints with a score less than best
429 @@ -59,14 +61,21 @@
430 /* Maximum number of n-grams in a fingerprint */
431 #define MAXNGRAMS 400
433 -/* Maximum size of an n-gram? */
434 -#define MAXNGRAMSIZE 5
435 +/* Maximum number of characters in an n-gram */
436 +#define MAXNGRAMSYMBOL 5
438 +/* Maximum size in bytes of the string representing an n-gram (must be at least the number of symbols) */
439 +#ifdef _UTF8_
440 +#define MAXNGRAMSIZE 20
441 +#else
442 +#define MAXNGRAMSIZE MAXNGRAMSYMBOL
443 +#endif
445 /* Which characters are not acceptable in n-grams? */
446 -#define INVALID(c) (isspace((int)c) || isdigit((int)c))
447 +#define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c))
449 /* Minimum size (in characters) for accepting a document */
450 -#define MINDOCSIZE 25
451 +#define MINDOCSIZE 6
453 /* Maximum penalty for missing an n-gram in fingerprint */
454 #define MAXOUTOFPLACE 400
455 @@ -75,5 +84,8 @@
456 #define TABLEPOW 13
458 #define MAXSCORE INT_MAX
460 +/* where the fingerprints files are stored */
461 +#define DEFAULT_FINGERPRINTS_PATH ""
463 #endif
464 --- misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003
465 +++ misc/build/libtextcat-2.2/src/fingerprint.c Mon Mar 31 11:29:14 2008
466 @@ -6,23 +6,23 @@
467 * All rights reserved.
469 * THE BSD LICENSE
470 - *
472 * Redistribution and use in source and binary forms, with or without
473 * modification, are permitted provided that the following conditions
474 * are met:
475 - *
477 * - Redistributions of source code must retain the above copyright
478 * notice, this list of conditions and the following disclaimer.
479 - *
481 * - Redistributions in binary form must reproduce the above copyright
482 * notice, this list of conditions and the following disclaimer in the
483 * documentation and/or other materials provided with the
484 * distribution.
485 - *
487 * - Neither the name of the WiseGuys Internet B.V. nor the names of
488 * its contributors may be used to endorse or promote products derived
489 * from this software without specific prior written permission.
490 - *
492 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
493 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
494 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
495 @@ -51,7 +51,7 @@
496 * The reason why we go through the trouble of doing a partial
497 * (heap)sort is that a full quicksort behaves horribly on the data:
498 * most n-grams have a very low count, resulting in a data set in
499 - * nearly-sorted order. This causes quicksort to behave very badly.
500 + * nearly-sorted order. This causes quicksort to behave very badly.
501 * Heapsort, on the other hand, behaves handsomely: worst case is
502 * Mlog(N) for M n-grams filtered through a N-sized heap.
504 @@ -63,6 +63,10 @@
505 * - put table/heap datastructure in a separate file.
508 +#ifndef _UTF8_
509 +#define _UTF8_
510 +#endif
512 #include "config.h"
513 #include <stdio.h>
514 #ifdef HAVE_STDLIB_H
515 @@ -80,10 +84,12 @@
516 #include "wg_mempool.h"
517 #include "constants.h"
519 +#include "utf8misc.h"
521 #define TABLESIZE (1<<TABLEPOW)
522 #define TABLEMASK ((TABLESIZE)-1)
525 typedef struct {
527 sint2 rank;
528 @@ -96,7 +102,7 @@
529 const char *name;
530 ngram_t *fprint;
531 uint4 size;
534 } fp_t;
536 typedef struct entry_s {
537 @@ -105,13 +111,13 @@
538 struct entry_s *next;
539 } entry_t;
541 -typedef struct table_s {
542 +typedef struct table_s {
543 void *pool;
544 entry_t **table;
545 entry_t *heap;
547 struct table_s *next;
550 uint4 heapsize;
551 uint4 size;
552 } table_t;
553 @@ -122,7 +128,7 @@
554 * fast and furious little hash function
556 * (Note that we could use some kind of rolling checksum, and update it
557 - * during n-gram construction)
558 + * during n-gram construction)
560 static uint4 simplehash( const char *p, int len )
562 @@ -134,29 +140,14 @@
566 -/* checks if n-gram lex is a prefix of key and of length len */
567 -inline int issame( char *lex, char *key, int len )
569 - int i;
570 - for (i=0; i<len; i++) {
571 - if ( key[i] != lex[i] ) {
572 - return 0;
575 - if ( lex[i] != 0 ) {
576 - return 0;
578 - return 1;
582 /* increases frequency of ngram(p,len) */
583 -static inline int increasefreq( table_t *t, char *p, int len )
585 - uint4 hash = simplehash( p, len ) & TABLEMASK;
586 +static int increasefreq( table_t *t, char *p, int len )
588 + uint4 hash = simplehash( p, len ) & TABLEMASK;
589 entry_t *entry = t->table[ hash ];
591 - while ( entry ) {
593 + while ( entry ) {
594 if ( issame( entry->str, p, len ) ) {
595 /*** Found it! ***/
596 entry->cnt++;
597 @@ -168,7 +159,7 @@
600 /*** Not found, so create ***/
601 - entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
602 + entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
603 strcpy( entry->str, p );
604 entry->cnt = 1;
606 @@ -181,12 +172,12 @@
607 #if 0
609 /* looks up ngram(p,len) */
610 -static entry_t *findfreq( table_t *t, char *p, int len )
612 - uint4 hash = simplehash( p, len ) & TABLEMASK;
613 +static entry_t *findfreq( table_t *t, char *p, int len )
615 + uint4 hash = simplehash( p, len ) & TABLEMASK;
616 entry_t *entry = t->table[ hash ];
618 - while ( entry ) {
620 + while ( entry ) {
621 if ( issame( entry->str, p, len ) ) {
622 return entry;
624 @@ -219,7 +210,7 @@
625 #define GREATER(x,y) ((x).cnt > (y).cnt)
626 #define LESS(x,y) ((x).cnt < (y).cnt)
628 -inline static void siftup( table_t *t, unsigned int child )
629 +static void siftup( table_t *t, unsigned int child )
631 entry_t *heap = t->heap;
632 unsigned int parent = (child-1) >> 1;
633 @@ -241,7 +232,7 @@
637 -inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
638 +static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
640 entry_t *heap = t->heap;
641 unsigned int child = parent*2 + 1;
642 @@ -273,7 +264,7 @@
643 if (t->size < t->heapsize) {
644 memcpy( &(heap[t->size]), item, sizeof(entry_t));
645 siftup( t, t->size );
646 - t->size++;
647 + t->size++;
648 return 0;
651 @@ -316,18 +307,18 @@
653 /*** Fill result heap ***/
654 for (i=0; i<TABLESIZE; i++) {
655 - entry_t *p = t->table[i];
656 + entry_t *p = t->table[i];
657 while (p) {
658 heapinsert(t, p);
659 p = p->next;
661 - }
663 return 1;
667 static table_t *inittable(uint4 maxngrams)
670 table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
671 result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
672 result->pool = wgmempool_Init( 10000, 10 );
673 @@ -347,14 +338,14 @@
674 wgmempool_Done(t->pool);
675 wg_free(t->table);
676 wg_free(t->heap);
677 - wg_free(t);
678 + wg_free(t);
682 extern void *fp_Init(const char *name)
684 fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
687 if ( name ) {
688 h->name = wg_strdup(name);
690 @@ -458,21 +449,27 @@
691 return dest;
695 +/**
696 +* This function extracts all n-grams from the passed buffer and puts them into the table "t".
697 +* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice
699 static void createngramtable( table_t *t, const char *buf )
701 char n[MAXNGRAMSIZE+1];
702 const char *p = buf;
703 int i;
704 + int pointer = 0;
706 /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
707 - for (;;p++) {
708 + while(1) {
710 - const char *q = p;
711 + const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
712 char *m = n;
714 /*** First char may be an underscore ***/
715 - *m++ = *q++;
716 + int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
717 + q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/
718 + m += decay; /*[modified]*/
719 *m = '\0';
721 increasefreq( t, n, 1 );
722 @@ -482,19 +479,22 @@
725 /*** Let the compiler unroll this ***/
726 - for ( i=2; i<=MAXNGRAMSIZE; i++) {
727 + for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
729 - *m++ = *q;
730 + decay = charcopy(q, m); /*[modified] like above*/
731 + m += decay;
732 *m = '\0';
734 increasefreq( t, n, i );
736 if ( *q == '_' ) break;
737 - q++;
738 + q += decay;
739 if ( *q == '\0' ) {
740 return;
744 + pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
746 return;
748 @@ -514,7 +514,7 @@
750 ngram_t *x = (ngram_t *)a;
751 ngram_t *y = (ngram_t *)b;
754 return mystrcmp( x->str, y->str );
757 @@ -522,12 +522,12 @@
759 ngram_t *x = (ngram_t *)a;
760 ngram_t *y = (ngram_t *)b;
763 return x->rank - y->rank;
767 - * Create a fingerprint:
768 + * Create a fingerprint:
769 * - record the frequency of each unique n-gram in a hash table
770 * - take the most frequent n-grams
771 * - sort them alphabetically, recording their relative rank
772 @@ -544,20 +544,21 @@
775 /*** Throw out all invalid chars ***/
776 - tmp = prepbuffer( buffer, bufsize );
777 + tmp = prepbuffer( buffer, bufsize );
778 + /*printf("Cleaned buffer : %s\n",tmp);*/
779 if ( tmp == NULL ) {
780 return 0;
783 h = (fp_t*)handle;
784 t = inittable(maxngrams);
785 + /*printf("Table initialized\n");*/
787 /*** Create a hash table containing n-gram counts ***/
788 createngramtable(t, tmp);
790 + /*printf("Table created\n");*/
791 /*** Take the top N n-grams and add them to the profile ***/
792 - table2heap(t);
793 - maxngrams = WGMIN( maxngrams, t->size );
794 + table2heap(t);
795 + maxngrams = WGMIN( maxngrams, t->size );
797 h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
798 h->size = maxngrams;
799 @@ -568,7 +569,7 @@
800 entry_t tmp2;
802 heapextract(t, &tmp2);
805 /*** the string and its rank is all we need ***/
806 strcpy( h->fprint[i].str, tmp2.str );
807 h->fprint[i].rank = i;
808 @@ -578,7 +579,7 @@
809 wg_free(tmp);
811 /*** Sort n-grams alphabetically, for easy comparison ***/
812 - qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
813 + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
814 return 1;
817 @@ -608,7 +609,7 @@
818 #endif
819 return 0;
823 h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
825 while (cnt < maxngrams && wg_getline(line,1024,fp)) {
826 @@ -635,7 +636,7 @@
827 h->size = cnt;
829 /*** Sort n-grams, for easy comparison later on ***/
830 - qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
831 + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
833 fclose(fp);
835 @@ -648,14 +649,15 @@
837 uint4 i;
838 fp_t *h = (fp_t *)handle;
839 - ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size );
841 + ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size );
843 /*** Make a temporary and sort it on rank ***/
844 memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
845 - qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
846 + qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
848 for (i=0; i<h->size; i++) {
849 - fprintf( fp, "%s\n", tmp[i].str );
850 + /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/
851 + fprintf( fp, "%s\n", tmp[i].str);
853 wg_free( tmp );
855 @@ -669,7 +671,7 @@
856 uint4 i = 0;
857 uint4 j = 0;
858 sint4 sum = 0;
861 /*** Compare the profiles in mergesort fashion ***/
862 while ( i < c->size && j < u->size ) {
864 @@ -705,7 +707,7 @@
867 return sum;
873 --- misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003
874 +++ misc/build/libtextcat-2.2/src/fingerprint.h Mon Mar 31 11:29:14 2008
875 @@ -41,7 +41,13 @@
876 extern int fp_Read( void *handle, const char *fname, int maxngrams );
877 extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
878 extern void fp_Show( void *handle );
879 +#ifdef __cplusplus
880 +extern "C" {
881 +#endif
882 extern const char *fp_Name( void *handle );
883 +#ifdef __cplusplus
885 +#endif
886 extern void fp_Print( void *handle, FILE *fp );
888 #endif
889 --- misc/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:30:06 2008
890 +++ misc/build/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:29:14 2008
891 @@ -1 +1,40 @@
892 -dummy
894 + global:
895 + charcopy
896 + issame
897 + nextcharstart
898 + utfstrlen
899 + wgmempool_Done
900 + wgmempool_Init
901 + wgmempool_Reset
902 + wgmempool_alloc
903 + wgmempool_getline
904 + wgmempool_strdup
905 + special_textcat_Init
906 + textcat_Classify
907 + textcat_Done
908 + textcat_Init
909 + textcat_Version
910 + fp_Compare
911 + fp_Create
912 + fp_Debug
913 + fp_Done
914 + fp_Init
915 + fp_Name
916 + fp_Print
917 + fp_Read
918 + heapextract
919 + wg_calloc
920 + wg_free
921 + wg_getline
922 + wg_malloc
923 + wg_split
924 + wg_strdup
925 + wg_strgmov
926 + wg_trim
927 + wg_zalloc
928 + wgmem_error
930 + local:
931 + *;
933 --- misc/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:30:06 2008
934 +++ misc/build/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:29:42 2008
935 @@ -1 +1,90 @@
936 -dummy
937 +#*************************************************************************
939 +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
941 +# Copyright 2008 by Sun Microsystems, Inc.
943 +# OpenOffice.org - a multi-platform office productivity suite
945 +# $RCSfile: libtextcat-2.2.patch,v $
947 +# $Revision: 1.8 $
949 +# This file is part of OpenOffice.org.
951 +# OpenOffice.org is free software: you can redistribute it and/or modify
952 +# it under the terms of the GNU Lesser General Public License version 3
953 +# only, as published by the Free Software Foundation.
955 +# OpenOffice.org is distributed in the hope that it will be useful,
956 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
957 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
958 +# GNU Lesser General Public License version 3 for more details
959 +# (a copy is included in the LICENSE file that accompanied this code).
961 +# You should have received a copy of the GNU Lesser General Public License
962 +# version 3 along with OpenOffice.org. If not, see
963 +# <http://www.openoffice.org/license.html>
964 +# for a copy of the LGPLv3 License.
966 +#*************************************************************************
968 +PRJ = ..$/..$/..$/..$/..
970 +PRJNAME = libtextcat
971 +TARGET = libtextcat
972 +CFLAGSCALL=gsd
974 +USE_DEFFILE=TRUE
975 +EXTERNAL_WARNINGS_NOT_ERRORS := TRUE
977 +.INCLUDE : settings.mk
979 +# --- Files --------------------------------------------------------
981 +# !! not to be compiled because these belong to stand-alone programs: !!
982 +# $(SLO)$/createfp.obj\
983 +# $(SLO)$/testtextcat.obj
985 +SLOFILES= \
986 + $(SLO)$/common.obj\
987 + $(SLO)$/fingerprint.obj\
988 + $(SLO)$/textcat.obj\
989 + $(SLO)$/wg_mempool.obj\
990 + $(SLO)$/utf8misc.obj
992 +#SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX)
993 +SHL1TARGET= $(TARGET)
995 +SHL1STDLIBS=
997 +# build DLL
998 +SHL1LIBS= $(SLB)$/$(TARGET).lib
999 +SHL1IMPLIB= i$(TARGET)
1000 +SHL1DEPN= $(SHL1LIBS)
1001 +SHL1DEF= $(MISC)$/$(SHL1TARGET).def
1003 +# build DEF file
1004 +DEF1NAME= $(SHL1TARGET)
1005 +DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt
1007 +SHL1VERSIONMAP= libtextcat.map
1009 +# --- Targets ------------------------------------------------------
1011 +.INCLUDE : target.mk
1013 +# copy hand supplied configuration file for Win32 builds to the file
1014 +# which is included in the source code
1015 +$(SLOFILES) : config.h
1016 +config.h :
1017 + $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h
1020 +$(MISC)$/$(SHL1TARGET).flt: makefile.mk
1021 + @echo ------------------------------
1022 + @echo Making: $@
1023 + @echo Imp>$@
1024 + @echo __CT>>$@
1025 + @echo _real>>$@
1026 + @echo unnamed>>$@
1027 --- misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003
1028 +++ misc/build/libtextcat-2.2/src/textcat.c Mon Mar 31 11:29:14 2008
1029 @@ -4,23 +4,23 @@
1030 * Copyright (C) 2003 WiseGuys Internet B.V.
1032 * THE BSD LICENSE
1033 - *
1035 * Redistribution and use in source and binary forms, with or without
1036 * modification, are permitted provided that the following conditions
1037 * are met:
1038 - *
1040 * - Redistributions of source code must retain the above copyright
1041 * notice, this list of conditions and the following disclaimer.
1042 - *
1044 * - Redistributions in binary form must reproduce the above copyright
1045 * notice, this list of conditions and the following disclaimer in the
1046 * documentation and/or other materials provided with the
1047 * distribution.
1048 - *
1050 * - Neither the name of the WiseGuys Internet B.V. nor the names of
1051 * its contributors may be used to endorse or promote products derived
1052 * from this software without specific prior written permission.
1053 - *
1055 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1056 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1057 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1058 @@ -74,6 +74,7 @@
1059 typedef struct {
1061 void **fprint;
1062 + char *fprint_disable;
1063 uint4 size;
1064 uint4 maxsize;
1066 @@ -112,11 +113,21 @@
1067 fp_Done( h->fprint[i] );
1069 wg_free( h->fprint );
1070 + wg_free( h->fprint_disable );
1071 wg_free( h );
1075 -extern void *textcat_Init( const char *conffile )
1076 +/** Replaces older function */
1077 +extern void *textcat_Init( const char *conffile ){
1078 + return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
1081 +/**
1082 + * Originally this function had only one parameter (conffile); it has been modified for use in OOo.
1083 + * Basically, prefix is the directory path where the fingerprints are stored; it is prepended verbatim to each fingerprint file name.
1084 + */
1085 +extern void *special_textcat_Init( const char *conffile, const char *prefix )
1087 textcat_t *h;
1088 char line[1024];
1089 @@ -134,11 +145,13 @@
1090 h->size = 0;
1091 h->maxsize = 16;
1092 h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
1093 + h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
1095 while ( wg_getline( line, 1024, fp ) ) {
1096 char *p;
1097 char *segment[4];
1098 - int res;
1099 + char finger_print_file_name[512];
1100 + int res;
1102 /*** Skip comments ***/
1103 #ifdef HAVE_STRCHR
1104 @@ -156,17 +169,23 @@
1105 /*** Ensure enough space ***/
1106 if ( h->size == h->maxsize ) {
1107 h->maxsize *= 2;
1108 - h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
1109 + h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
1110 + h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
1113 /*** Load data ***/
1114 if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
1115 goto ERROR;
1117 - if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
1118 + finger_print_file_name[0] = '\0';
1119 + strcat(finger_print_file_name, prefix);
1120 + strcat(finger_print_file_name, segment[0]);
1122 + if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
1123 textcat_Done(h);
1124 goto ERROR;
1125 - }
1127 + h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
1128 h->size++;
1131 @@ -203,11 +222,18 @@
1132 result = _TEXTCAT_RESULT_SHORT;
1133 goto READY;
1137 /*** Calculate the score for each category. ***/
1138 for (i=0; i<h->size; i++) {
1139 - int score = fp_Compare( h->fprint[i], unknown, threshold );
1140 - candidates[i].score = score;
1141 + int score;
1142 + if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
1143 + score = MAXSCORE;
1145 + else{
1146 + score = fp_Compare( h->fprint[i], unknown, threshold );
1147 + /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
1149 + candidates[i].score = score;
1150 candidates[i].name = fp_Name( h->fprint[i] );
1151 if ( score < minscore ) {
1152 minscore = score;
1153 @@ -218,7 +244,6 @@
1154 /*** Find the best performers ***/
1155 for (i=0; i<h->size; i++) {
1156 if ( candidates[i].score < threshold ) {
1158 if ( ++cnt == MAXCANDIDATES+1 ) {
1159 break;
1161 @@ -235,7 +260,7 @@
1162 else {
1163 char *p = result;
1164 char *plimit = result+MAXOUTPUTSIZE;
1167 qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
1169 *p = '\0';
1170 @@ -247,7 +272,7 @@
1172 READY:
1173 fp_Done(unknown);
1174 -#ifdef SHOULD_FREE
1175 +#ifdef SHOULD_FREE
1176 free(candidates);
1177 #undef SHOULD_FREE
1178 #endif
1179 --- misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003
1180 +++ misc/build/libtextcat-2.2/src/textcat.h Mon Mar 31 11:29:14 2008
1181 @@ -40,6 +40,9 @@
1182 #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
1183 #define _TEXTCAT_RESULT_SHORT "SHORT"
1185 +#ifdef __cplusplus
1186 +extern "C" {
1187 +#endif
1190 * textcat_Init() - Initialize the text classifier. The textfile
1191 @@ -51,10 +54,19 @@
1192 * Returns: handle on success, NULL on error. (At the moment, the
1193 * only way errors can occur, is when the library cannot read the
1194 * conffile, or one of the fingerprint files listed in it.)
1196 + * Replaces the older function (and has exactly the same behaviour);
1197 + * see below.
1199 extern void *textcat_Init( const char *conffile );
1202 + * Originally this function had only one parameter (conffile); it has been modified because OOo must be able to load an alternative DB.
1203 + * Basically, prefix is the directory path where fingerprints are stored.
1204 + */
1205 +extern void *special_textcat_Init( const char *conffile, const char *prefix );
1207 +/**
1208 * textcat_Done() - Free up resources for handle
1210 extern void textcat_Done( void *handle );
1211 @@ -77,4 +89,8 @@
1212 * textcat_Version() - Returns a string describing the version of this classifier.
1214 extern char *textcat_Version();
1216 +#ifdef __cplusplus
1218 +#endif
1219 #endif
1220 --- misc/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:30:06 2008
1221 +++ misc/build/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:29:14 2008
1222 @@ -1 +1,132 @@
1223 -dummy
1224 +/***************************************************************************
1225 + * Copyright (C) 2006 by Jocelyn Merand *
1226 + * joc.mer@gmail.com *
1227 + * *
1228 + * THE BSD LICENSE
1230 + * Redistribution and use in source and binary forms, with or without
1231 + * modification, are permitted provided that the following conditions
1232 + * are met:
1234 + * - Redistributions of source code must retain the above copyright
1235 + * notice, this list of conditions and the following disclaimer.
1237 + * - Redistributions in binary form must reproduce the above copyright
1238 + * notice, this list of conditions and the following disclaimer in the
1239 + * documentation and/or other materials provided with the
1240 + * distribution.
1242 + * - Neither the name of the WiseGuys Internet B.V. nor the names of
1243 + * its contributors may be used to endorse or promote products derived
1244 + * from this software without specific prior written permission.
1246 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1247 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1248 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1249 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1250 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
1251 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
1252 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
1253 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
1254 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1255 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1256 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1257 + ***************************************************************************/
1259 +#ifndef _UTF8_MISC_H_
1260 +#include "utf8misc.h"
1261 +#endif
1264 +int nextcharstart(const char *str, int position){
1265 + int pointer = position;
1267 + if(str[pointer] & ESCAPE_MASK){ /*if the current byte has an ESCAPE_MASK bit set (the first bit when _UTF8_ is defined)*/
1269 + /*then str[pointer] is treated as an escape character*/
1271 + char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use its high-order bits (WEIGHT_MASK), shifted left, to count the following bytes*/
1273 + while(escape_char & ESCAPE_MASK && str[pointer]){/*at every step we shift the byte left by 1 bit; we stop when its first bit is 0 or the string ends*/
1274 + escape_char = escape_char <<1;
1275 + ++pointer;
1278 + if(str[pointer]){ /*finally, if we are not on the \0 character, we jump to the next character*/
1279 + ++pointer;
1281 + return pointer;
1285 +int charcopy(const char *str, char *dest){
1287 + int pointer = 0;
1288 + if(str[pointer] & ESCAPE_MASK){ /*if the current byte has an ESCAPE_MASK bit set (the first bit when _UTF8_ is defined)*/
1290 + /*then str[pointer] is treated as an escape character*/
1292 + char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use its high-order bits (WEIGHT_MASK), shifted left, to count the following bytes*/
1294 + while(escape_char & ESCAPE_MASK && str[pointer]){ /*at every step we copy one byte and shift left by 1 bit; we stop when the first bit is 0 or the string ends*/
1295 + dest[pointer] = str[pointer];
1296 + escape_char = escape_char <<1;
1297 + ++pointer;
1300 + if(str[pointer]){ /*copy the current byte too, unless it is the terminating \0*/
1301 + dest[pointer] = str[pointer];
1302 + ++pointer;
1305 + return pointer;
1309 +int issame( char *lex, char *key, int len )
1311 + /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
1312 + int char_counter = 0;
1313 + int pointer = 0;
1314 + while(char_counter < len) {
1316 + if(key[pointer] & ESCAPE_MASK){ /*if the current byte of key has an ESCAPE_MASK bit set (the first bit when _UTF8_ is defined)*/
1318 + /*then key[pointer] is treated as an escape character*/
1320 + char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use its high-order bits (WEIGHT_MASK), shifted left, to count the following bytes*/
1322 + while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
1323 + escape_char = escape_char <<1;
1324 + ++pointer;
1327 + ++char_counter; /*one more character (possibly multi-byte) has been compared*/
1328 + if ( key[pointer] != lex[pointer] ) {
1329 + return 0;
1330 + /*printf(" NO\n", lex, key, len);*/
1332 + ++pointer;
1334 + if ( lex[pointer] != '\0' ) {
1335 + return 0;
1336 + /*printf(" NO\n");*/
1339 + /*printf(" YES\n");*/
1341 + return 1;
1345 +extern int utfstrlen(const char* str){
1346 + int char_counter = 0;
1347 + int pointer = 0;
1348 + while(str[pointer]) {
1349 + pointer = nextcharstart(str, pointer);
1351 + ++char_counter; /*nextcharstart has just moved past one character, so count it*/
1353 + return char_counter;
1356 --- misc/libtextcat-2.2/src/utf8misc.h Mon Mar 31 11:30:06 2008
1357 +++ misc/build/libtextcat-2.2/src/utf8misc.h Mon Mar 31 11:29:14 2008
1358 @@ -1 +1,88 @@
1359 -dummy
1360 +/***************************************************************************
1361 + * Copyright (C) 2006 by Jocelyn Merand *
1362 + * joc.mer@gmail.com *
1363 + * *
1364 + * THE BSD LICENSE
1366 + * Redistribution and use in source and binary forms, with or without
1367 + * modification, are permitted provided that the following conditions
1368 + * are met:
1370 + * - Redistributions of source code must retain the above copyright
1371 + * notice, this list of conditions and the following disclaimer.
1373 + * - Redistributions in binary form must reproduce the above copyright
1374 + * notice, this list of conditions and the following disclaimer in the
1375 + * documentation and/or other materials provided with the
1376 + * distribution.
1378 + * - Neither the name of the WiseGuys Internet B.V. nor the names of
1379 + * its contributors may be used to endorse or promote products derived
1380 + * from this software without specific prior written permission.
1382 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1383 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1384 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1385 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1386 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
1387 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
1388 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
1389 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
1390 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1391 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1392 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1393 + ***************************************************************************/
1395 +#ifndef _UTF8_MISC_H_
1396 +#define _UTF8_MISC_H_
1398 +/**
1399 + * These macros are used in the character processing functions.
1400 + * They have been added to manage utf-8 symbols, particularly escape chars.
1401 + */
1402 +#ifdef _UTF8_
1403 +#define ESCAPE_MASK 0x80
1404 +#define WEIGHT_MASK 0xF0
1405 +#else
1406 +#define ESCAPE_MASK 0xFF
1407 +#define WEIGHT_MASK 0x00
1408 +#endif
1412 + * Is used to jump to the start of the next char.
1413 + * Of course, it's only useful when the encoding is utf-8.
1414 + * This function has been added by Jocelyn Merand to use libtextcat in OOo.
1415 + */
1416 +int nextcharstart(const char *str, int position);
1419 +/* Copies the char in str to dest.
1420 + * Of course, it's only useful when the encoding is utf8 and the symbol is encoded with more than 1 char.
1421 + * Returns the number of chars jumped.
1422 + * This function has been added by Jocelyn Merand to use libtextcat in OOo.
1423 + */
1424 +int charcopy(const char *str, char *dest);
1427 +/* Checks if n-gram lex is a prefix of key and of length len.
1428 +* If _UTF8_ is defined, it uses escape characters and len is not really the length of lex;
1429 +* in this case, len is the number of utf-8 chars: strlen("€") == 3 but len == 1
1431 +int issame( char *lex, char *key, int len );
1434 +/* Counts the number of characters.
1435 +* If _UTF8_ is defined, it uses escape characters and the result is not really the length of str;
1436 +* in this case, the result is the number of utf-8 chars: strlen("€") == 3 but utfstrlen("€") == 1
1438 +#ifdef __cplusplus
1439 +extern "C" {
1440 +#endif
1441 +extern int utfstrlen(const char* str);
1442 +#ifdef __cplusplus
1444 +#endif
1446 +#endif
1448 --- misc/libtextcat-2.2/src/win32_config.h Mon Mar 31 11:30:06 2008
1449 +++ misc/build/libtextcat-2.2/src/win32_config.h Mon Mar 31 11:29:14 2008
1450 @@ -1 +1,136 @@
1451 -dummy
1452 +/* src/config.h. Generated by configure. */
1453 +/* src/config.h.in. Generated from configure.ac by autoheader. */
1455 +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
1456 + systems. This function is required for `alloca.c' support on those systems.
1457 + */
1458 +/* #undef CRAY_STACKSEG_END */
1460 +/* Define to 1 if using `alloca.c'. */
1461 +/* #undef C_ALLOCA */
1463 +/* Define to 1 if you have `alloca', as a function or macro. */
1464 +/* #undef HAVE_ALLOCA */
1466 +/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
1467 + */
1468 +/* #undef HAVE_ALLOCA_H */
1470 +/* Define to 1 if you have the <dlfcn.h> header file. */
1471 +#define HAVE_DLFCN_H 1
1473 +/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */
1474 +/* #undef HAVE_DOPRNT */
1476 +/* Define to 1 if you have the `gettimeofday' function. */
1477 +/* #undef HAVE_GETTIMEOFDAY */
1479 +/* Define to 1 if you have the <inttypes.h> header file. */
1480 +/* #undef HAVE_INTTYPES_H */
1482 +/* Define to 1 if you have the <limits.h> header file. */
1483 +#define HAVE_LIMITS_H 1
1485 +/* Define to 1 if your system has a GNU libc compatible `malloc' function, and
1486 + to 0 otherwise. */
1487 +#define HAVE_MALLOC 1
1489 +/* Define to 1 if you have the <memory.h> header file. */
1490 +#define HAVE_MEMORY_H 1
1492 +/* Define to 1 if you have the `memset' function. */
1493 +#define HAVE_MEMSET 1
1495 +/* Define to 1 if your system has a GNU libc compatible `realloc' function,
1496 + and to 0 otherwise. */
1497 +#define HAVE_REALLOC 1
1499 +/* Define to 1 if you have the <stdint.h> header file. */
1500 +/* #undef HAVE_STDINT_H */
1502 +/* Define to 1 if you have the <stdlib.h> header file. */
1503 +#define HAVE_STDLIB_H 1
1505 +/* Define to 1 if you have the `strchr' function. */
1506 +#define HAVE_STRCHR 1
1508 +/* Define to 1 if you have the `strdup' function. */
1509 +#define HAVE_STRDUP 1
1511 +/* Define to 1 if you have the <strings.h> header file. */
1512 +/* #undef HAVE_STRINGS_H */
1514 +/* Define to 1 if you have the <string.h> header file. */
1515 +#define HAVE_STRING_H 1
1517 +/* Define to 1 if you have the `strpbrk' function. */
1518 +#define HAVE_STRPBRK 1
1520 +/* Define to 1 if you have the <sys/stat.h> header file. */
1521 +#define HAVE_SYS_STAT_H 1
1523 +/* Define to 1 if you have the <sys/time.h> header file. */
1524 +/* #undef HAVE_SYS_TIME_H */
1526 +/* Define to 1 if you have the <sys/types.h> header file. */
1527 +#define HAVE_SYS_TYPES_H 1
1529 +/* Define to 1 if you have the <unistd.h> header file. */
1530 +#define HAVE_UNISTD_H 1
1532 +/* Define to 1 if you have the `vprintf' function. */
1533 +#define HAVE_VPRINTF 1
1535 +/* Name of package */
1536 +#define PACKAGE "libtextcat"
1538 +/* Define to the address where bug reports for this package should be sent. */
1539 +#define PACKAGE_BUGREPORT ""
1541 +/* Define to the full name of this package. */
1542 +#define PACKAGE_NAME "libtextcat"
1544 +/* Define to the full name and version of this package. */
1545 +#define PACKAGE_STRING "libtextcat 2.2"
1547 +/* Define to the one symbol short name of this package. */
1548 +#define PACKAGE_TARNAME "libtextcat"
1550 +/* Define to the version of this package. */
1551 +#define PACKAGE_VERSION "2.2"
1553 +/* If using the C implementation of alloca, define if you know the
1554 + direction of stack growth for your system; otherwise it will be
1555 + automatically deduced at run-time.
1556 + STACK_DIRECTION > 0 => grows toward higher addresses
1557 + STACK_DIRECTION < 0 => grows toward lower addresses
1558 + STACK_DIRECTION = 0 => direction of growth unknown */
1559 +/* #undef STACK_DIRECTION */
1561 +/* Define to 1 if you have the ANSI C header files. */
1562 +#define STDC_HEADERS 1
1564 +/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
1565 +#define TIME_WITH_SYS_TIME 1
1567 +/* Define to 1 if your <sys/time.h> declares `struct tm'. */
1568 +/* #undef TM_IN_SYS_TIME */
1570 +/* Version number of package */
1571 +#define VERSION "2.2"
1573 +/* Define to empty if `const' does not conform to ANSI C. */
1574 +/* #undef const */
1576 +/* Define as `__inline' if that's what the C compiler calls it, or to nothing
1577 + if it is not supported. */
1578 +/* #undef inline */
1580 +/* Define to rpl_malloc if the replacement function should be used. */
1581 +/* #undef malloc */
1583 +/* Define to rpl_realloc if the replacement function should be used. */
1584 +/* #undef realloc */
1586 +/* Define to `unsigned' if <sys/types.h> does not define. */
1587 +/* #undef size_t */