From c062af27bedf06d0113bf6eb9722140b477eb191 Mon Sep 17 00:00:00 2001 From: dbera Date: Wed, 25 Oct 2006 21:53:00 +0000 Subject: [PATCH] configure.in, AssemblyInfo.cs: For those unfortunate earthlings without libchm, libwv or gsf, don't register those filters. NoiseFilter.cs, LuceneCommon.cs: Store interesting tokens from emails and hostnames while indexing, so that parts of hostnames and emails found in the text of a document are searchable. --- Filters/AssemblyInfo.cs | 8 ++++- beagled/AssemblyInfo.cs | 44 ++++++++++++------------- beagled/LuceneCommon.cs | 2 +- beagled/NoiseFilter.cs | 87 ++++++++++++++++++++++++++++++++++++++++++------- configure.in | 15 +++++++-- 5 files changed, 118 insertions(+), 38 deletions(-) diff --git a/Filters/AssemblyInfo.cs b/Filters/AssemblyInfo.cs index 317e856e..4934daed 100644 --- a/Filters/AssemblyInfo.cs +++ b/Filters/AssemblyInfo.cs @@ -34,13 +34,17 @@ using Beagle.Filters; typeof(FilterBMP), typeof(FilterBoo), typeof(FilterC), +#if HAVE_LIBCHM typeof(FilterChm), +#endif typeof(FilterCpp), typeof(FilterCSharp), typeof(FilterDeb), typeof(FilterDesktop), typeof(FilterDirectory), +#if ENABLE_WV1 typeof(FilterDOC), +#endif typeof(FilterDocbook), typeof(FilterEbuild), typeof(FilterExternal), @@ -67,7 +71,9 @@ using Beagle.Filters; typeof(FilterPerl), typeof(FilterPhp), typeof(FilterPng), +#if ENABLE_GSF_SHARP typeof(FilterPPT), +#endif typeof(FilterPython), typeof(FilterRPM), typeof(FilterRTF), @@ -81,4 +87,4 @@ using Beagle.Filters; typeof(FilterTiff), typeof(FilterTotem), typeof(FilterXslt) -)] \ No newline at end of file +)] diff --git a/beagled/AssemblyInfo.cs b/beagled/AssemblyInfo.cs index bf16faf3..3e127ebb 100644 --- a/beagled/AssemblyInfo.cs +++ b/beagled/AssemblyInfo.cs @@ -32,36 +32,36 @@ using Beagle.Daemon; // Any request message types in the BeagleDaemonLib.dll file must be registered // in these two attributes. 
[assembly: RequestMessageTypes ( - typeof (RemoteIndexerRequest) + typeof (RemoteIndexerRequest) )] [assembly: ResponseMessageTypes ( - typeof (RemoteIndexerResponse) + typeof (RemoteIndexerResponse) )] // Executors go in here. [assembly: RequestMessageExecutorTypes ( - typeof (DaemonInformationExecutor), - typeof (OptimizeIndexesExecutor), - typeof (ReloadConfigExecutor), - typeof (ShutdownExecutor), - typeof (SnippetExecutor) + typeof (DaemonInformationExecutor), + typeof (OptimizeIndexesExecutor), + typeof (ReloadConfigExecutor), + typeof (ShutdownExecutor), + typeof (SnippetExecutor) )] // All backends in this assembly must be registered here. [assembly: IQueryableTypes ( - typeof (Beagle.Daemon.KMailQueryable.KMailQueryable), - typeof (Beagle.Daemon.FileSystemQueryable.FileSystemQueryable), - typeof (Beagle.Daemon.GaimLogQueryable.GaimLogQueryable), - typeof (Beagle.Daemon.IndexingServiceQueryable.IndexingServiceQueryable), - typeof (Beagle.Daemon.TomboyQueryable.TomboyQueryable), - typeof (Beagle.Daemon.LabyrinthQueryable.LabyrinthQueryable), - typeof (Beagle.Daemon.BlamQueryable.BlamQueryable), - typeof (Beagle.Daemon.LifereaQueryable.LifereaQueryable), - typeof (Beagle.Daemon.AkregatorQueryable.AkregatorQueryable), - typeof (Beagle.Daemon.KonqQueryable.KonqQueryable), - typeof (Beagle.Daemon.KNotesQueryable.KNotesQueryable), - typeof (Beagle.Daemon.KabcQueryable.KabcQueryable), - typeof (Beagle.Daemon.KopeteQueryable.KopeteQueryable), - typeof (Beagle.Daemon.StaticQueryable) -)] \ No newline at end of file + typeof (Beagle.Daemon.KMailQueryable.KMailQueryable), + typeof (Beagle.Daemon.FileSystemQueryable.FileSystemQueryable), + typeof (Beagle.Daemon.GaimLogQueryable.GaimLogQueryable), + typeof (Beagle.Daemon.IndexingServiceQueryable.IndexingServiceQueryable), + typeof (Beagle.Daemon.TomboyQueryable.TomboyQueryable), + typeof (Beagle.Daemon.LabyrinthQueryable.LabyrinthQueryable), + typeof (Beagle.Daemon.BlamQueryable.BlamQueryable), + typeof 
(Beagle.Daemon.LifereaQueryable.LifereaQueryable), + typeof (Beagle.Daemon.AkregatorQueryable.AkregatorQueryable), + typeof (Beagle.Daemon.KonqQueryable.KonqQueryable), + typeof (Beagle.Daemon.KNotesQueryable.KNotesQueryable), + typeof (Beagle.Daemon.KabcQueryable.KabcQueryable), + typeof (Beagle.Daemon.KopeteQueryable.KopeteQueryable), + typeof (Beagle.Daemon.StaticQueryable) +)] diff --git a/beagled/LuceneCommon.cs b/beagled/LuceneCommon.cs index a0b285e8..58488be9 100644 --- a/beagled/LuceneCommon.cs +++ b/beagled/LuceneCommon.cs @@ -428,7 +428,7 @@ namespace Beagle.Daemon { || fieldName == "HotText" || fieldName == "PropertyText" || is_text_prop) { - outstream = new NoiseFilter (outstream); + outstream = new NoiseEmailHostFilter (outstream); outstream = new PorterStemFilter (outstream); } diff --git a/beagled/NoiseFilter.cs b/beagled/NoiseFilter.cs index 8639ff05..9a43f0fd 100644 --- a/beagled/NoiseFilter.cs +++ b/beagled/NoiseFilter.cs @@ -1,6 +1,7 @@ // // NoiseFilter.cs // +// Copyright (C) 2006 Debajyoti Bera // Copyright (C) 2004-2005 Novell, Inc. // @@ -25,20 +26,25 @@ // using System; +using System.Collections; using Lucene.Net.Analysis; using LNSA = Lucene.Net.Analysis.Standard; namespace Beagle.Daemon { - class NoiseFilter : TokenFilter { + // TokenFilter which does several fancy things + // 1. Removes words which are potential noise like dhyhy8ju7q9 + // 2. Splits email addresses into meaningful tokens + // 3. 
Splits hostnames into subparts + class NoiseEmailHostFilter : TokenFilter { static int total_count = 0; static int noise_count = 0; TokenStream token_stream; - public NoiseFilter (TokenStream input) : base (input) + public NoiseEmailHostFilter (TokenStream input) : base (input) { token_stream = input; } @@ -129,32 +135,53 @@ namespace Beagle.Daemon { private static readonly string tokentype_number = LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM]; - private bool IgnoreNoise (Lucene.Net.Analysis.Token token) + private bool ProcessToken (Lucene.Net.Analysis.Token token) { string type = token.Type (); - if (type == tokentype_email || - type == tokentype_host) + if (type == tokentype_email) { + ProcessEmailToken (token); return true; - - if (type == tokentype_number) + } else if (type == tokentype_host) { + ProcessURLToken (token); + return true; + } else if (type == tokentype_number) // nobody will remember more than 10 digits return (token.TermText ().Length <= 10); - - return false; + else + return false; } + private Queue parts = new Queue (); + private Lucene.Net.Analysis.Token token; + public override Lucene.Net.Analysis.Token Next () { - Lucene.Net.Analysis.Token token; + if (parts.Count != 0) { + string part = (string) parts.Dequeue (); + Lucene.Net.Analysis.Token part_token; + // FIXME: Searching for google.com will not match www.google.com. 
+ // If we decide to allow google-style "abcd.1234" which means + // "abcd 1234" as a consecutive phrase, then adjusting + // the startOffset and endOffset would enable matching + // google.com to www.google.com + part_token = new Lucene.Net.Analysis.Token (part, + token.StartOffset (), + token.EndOffset (), + token.Type ()); + part_token.SetPositionIncrement (0); + return part_token; + } + while ( (token = token_stream.Next ()) != null) { + //Console.WriteLine ("Found token: [{0}]", token.TermText ()); #if false if (total_count > 0 && total_count % 5000 == 0) Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)", noise_count, total_count, 100.0 * noise_count / total_count); #endif ++total_count; - if (IgnoreNoise (token)) + if (ProcessToken (token)) return token; if (IsNoise (token.TermText ())) { ++noise_count; @@ -164,6 +191,44 @@ namespace Beagle.Daemon { } return null; } + + char[] replace_array = { '@', '.', '-', '_', '+' }; + private void ProcessEmailToken (Lucene.Net.Analysis.Token token) + { + string email = token.TermText (); + string[] tmp = email.Split (replace_array); + int l = tmp.Length; + + // store username part as a large token (overwrites the last split part) + int index_at = email.IndexOf ('@'); + tmp [l-1] = email.Substring (0, index_at); + + foreach (string s in tmp) + parts.Enqueue (s); + + } + + private void ProcessURLToken (Lucene.Net.Analysis.Token token) + { + string hostname = token.TermText (); + string[] host_parts = hostname.Split ('.'); + + // remove initial www + int begin_index = (host_parts [0] == "www" ? 1 : 0); + // remove final tld + // FIXME: any string of form "<ALPHANUM> ('.' <ALPHANUM>)+" has type HOST + // Removing last token might remove important words from non-host + // string of that form. To fix that, we need to match against the + // huge list of TLDs. + int end_index = host_parts.Length - 1; + + if (! 
Char.IsDigit (host_parts [end_index][0])) + end_index --; + + for (int i = begin_index; i <= end_index; ++i) + parts.Enqueue (host_parts [i]); + + } } diff --git a/configure.in b/configure.in index 8b70bce6..e5fcb1f6 100644 --- a/configure.in +++ b/configure.in @@ -61,6 +61,8 @@ fi # check that we have the require version of mono PKG_CHECK_MODULES(MONO, mono >= $MONO_REQUIRED) +BEAGLE_DEFINES="" + # SharpZipLib broke API in 1.1.11. Route around the damage. if `$PKG_CONFIG --atleast-version=1.1.11 mono`; then SHARPZIPLIB_LIBS="-r:`$PKG_CONFIG --variable=prefix mono`/lib/mono/compat-1.0/ICSharpCode.SharpZipLib.dll" @@ -349,6 +351,9 @@ dnl beagled PKG_CHECK_MODULES(GSF_SHARP, gsf-sharp >= 0.5, enable_gsf_sharp=yes, enable_gsf_sharp=no) AM_CONDITIONAL(ENABLE_GSF_SHARP, test "x$enable_gsf_sharp" = "xyes") +if test "x$enable_gsf_sharp" = "xyes"; then + BEAGLE_DEFINES="$BEAGLE_DEFINES -define:ENABLE_GSF_SHARP" +fi dnl ---------------------------------------------- @@ -534,6 +539,9 @@ dnl For the wv1 glue PKG_CHECK_MODULES(WV1, wv-1.0, enable_wv1=yes, enable_wv1=no) AM_CONDITIONAL(ENABLE_WV1, test "x$enable_wv1" = "xyes") +if test "x$enable_wv1" = "xyes"; then + BEAGLE_DEFINES="$BEAGLE_DEFINES -define:ENABLE_WV1" +fi AC_SUBST(WV1_LIBS) dnl ---------------------------------------------- @@ -568,13 +576,14 @@ dnl Check if the system has libchm, used by the CHMFilter # Check for libchm for CHM filter AC_CHECK_LIB(chm,chm_open,has_libchm=yes,has_libchm=no) AM_CONDITIONAL(HAS_LIBCHM, test "x$has_libchm" = "xyes") +if test "x$has_libchm" = "xyes"; then + BEAGLE_DEFINES="$BEAGLE_DEFINES -define:HAVE_LIBCHM" +fi dnl ---------------------------------------------- dnl Conditional for debugging XML messages. 
-BEAGLE_DEFINES="" - AC_ARG_ENABLE([xml-dump], AC_HELP_STRING([--enable-xml-dump], [Enables printing of the XML messages sent between components (default no)]), enable_xml_dump=$enableval, @@ -582,7 +591,7 @@ AC_ARG_ENABLE([xml-dump], if test "x$enable_xml_dump" = "xyes"; then AC_DEFINE_UNQUOTED(ENABLE_XML_DUMP, 1, [Dump XML messages for debugging]) - BEAGLE_DEFINES="-define:ENABLE_XML_DUMP" + BEAGLE_DEFINES="$BEAGLE_DEFINES -define:ENABLE_XML_DUMP" fi AC_SUBST(BEAGLE_DEFINES) -- 2.11.4.GIT